# Scrape from Reddit 

_By Tim Dwyer_ 


We're going to scrape posts from the `all` page on Reddit using the official Reddit API. To do this I registered a script app with Reddit to obtain OAuth credentials. Those credentials aren't included in this repository, but if you register your own app you can use this code for the same purpose. 

In [1]:
import requests
import requests.auth
import json
import time
from calendar import monthrange

In [2]:
def get_new_token(app_id, secret, user, pw):
    """Request an access token from Reddit with the password grant.

    The app id and secret are sent as HTTP basic auth; the account's
    username and password are sent as form data.

    Args:
        app_id: Client id of the registered Reddit app.
        secret: Client secret of the registered Reddit app.
        user: Reddit username; also embedded in the User-Agent header.
        pw: Password for ``user``.

    Returns:
        The decoded JSON body of the token response.
    """
    token_url = "https://www.reddit.com/api/v1/access_token"
    form_fields = {"grant_type": "password", "username": user, "password": pw}
    agent_header = {"User-Agent": f"data science class project .1 u/{user}"}
    basic_auth = requests.auth.HTTPBasicAuth(app_id, secret)

    token_reply = requests.post(
        token_url,
        auth=basic_auth,
        data=form_fields,
        headers=agent_header,
    )
    return token_reply.json()

def print_time():
    """Print the current local hour, minute and second, space-separated.

    Returns:
        None (the result of ``print``).
    """
    now = time.localtime()
    # Fields 3..5 of a struct_time are tm_hour, tm_min, tm_sec.
    hour, minute, second = now[3:6]
    return print(hour, minute, second)

def reddit_scrape(subreddit, app_id, secret, user, pw, pages=5, save_to_json=True, verbose=True, timer=True):
    """Download successive listing pages of a subreddit through the Reddit API.

    Pages are fetched one at a time, following the ``after`` cursor returned
    with each page, with at least one second between requests. Scraping stops
    early once Reddit stops returning a cursor, because re-requesting with a
    stale cursor only yields the same page again.

    Args:
        subreddit: Subreddit name, e.g. ``'all'``.
        app_id: Client id of the registered Reddit app.
        secret: Client secret of the registered Reddit app.
        user: Reddit username used to obtain the token.
        pw: Password for ``user``.
        pages: Maximum number of listing pages to request.
        save_to_json: If true, write the posts to
            ``./data/json/{subreddit}.json`` and return None; otherwise
            return the posts.
        verbose: If true, print the time and page index roughly every 10%
            of ``pages``.
        timer: If true, print the time when scraping starts and finishes.

    Returns:
        The list of collected post objects when ``save_to_json`` is false,
        otherwise None.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
    """
    user_agent = "data science class project 0.1 u/timdwyer_app"

    def build_headers():
        # Fetch a fresh token and wrap it in the headers sent with each request.
        token = get_new_token(app_id, secret, user, pw)
        return {
            "Authorization": f"{token['token_type']} {token['access_token']}",
            "User-Agent": user_agent,
        }

    after = None
    posts = []
    params = {}
    ten_percentile = 0
    prev_query_time = 0
    token_issued = time.time()
    my_headers = build_headers()

    if timer:
        print_time()

    for page in range(pages):
        if after:
            params['after'] = after

        # Keep at least one second between consecutive listing requests.
        time_elapsed = time.time() - prev_query_time
        if time_elapsed <= 1:
            time.sleep(1 - time_elapsed)

        # A token is treated as stale after 3600 seconds. The clock is reset
        # on refresh so that only one new token is requested per hour rather
        # than one per page after the first hour.
        if time.time() - token_issued >= 3600:
            token_issued = time.time()
            my_headers = build_headers()

        # NOTE(review): Reddit's OAuth documentation directs bearer-token
        # requests to https://oauth.reddit.com; the host is left unchanged
        # here -- confirm before switching.
        res = requests.get(f'https://reddit.com/r/{subreddit}.json', params=params, headers=my_headers)
        prev_query_time = time.time()
        # Surface HTTP failures directly instead of as a KeyError below.
        res.raise_for_status()

        listing = res.json()['data']
        posts.extend(listing.get('children') or [])
        after = listing['after']

        if verbose and page / pages > ten_percentile:
            print_time()
            print(page)
            ten_percentile += .1

        # No cursor means the listing is exhausted; further requests would
        # reuse the previous cursor and append duplicate posts.
        if not after:
            break

    if timer:
        print_time()

    if save_to_json:
        with open(f'./data/json/{subreddit}.json', 'w') as file:
            json.dump(posts, file)
    else:
        return posts


The list `info` contains my `app_id, app_secret, user_name, password`, in that order. The cell below will not run as written, because the file holding these credentials is deliberately not included in this repository. To test that this code runs, create your own Reddit app and supply your own app id and secret along with your username and password. 

In [3]:
# Build the credential list from the login file: every line is split on
# whitespace and its second token is kept.
info = []
with open('./reddit_login.txt', 'r') as file:
    info.extend(line.split()[1] for line in file)

In [4]:
# Request up to 500 listing pages of r/all with progress and timing output
# switched off; the credentials in `info` are unpacked positionally.
reddit_scrape(
    'all',
    *info,
    pages=500,
    verbose=False,
    timer=False,
)

In [5]:
# Load the scraped threads back from disk.
with open('./data/json/all.json') as file:
    threads = json.load(file)

# Print the number of distinct titles next to the total number of threads.
print(len({thread['data']['title'] for thread in threads}), len(threads))

# Scrape from Pushshift API

We ended up with fewer unique posts in `all` than it seems at first. Due to a constraint in how results are returned by the Reddit API, we can only get this many from `all` at any given time, and the limit is about $1,000$ on other subreddits.

One way to get around this would be to use the Reddit Search feature to create new listings that we could associate with specific time intervals. Instead we'll shift gears and use the [Pushshift API](https://pushshift.io/). This is a third party API which allows us to search Reddit threads by time stamp without having to go through the extra step above. 

In [6]:
def reddit_scrape_push(subreddit, num_days, start_date, save_to_json=True, verbose=True, timer=True):
    """Collect a subreddit's submissions from Pushshift in one-hour windows.

    Starting at ``start_date``, the function issues 24 requests per day for
    ``num_days`` days, each covering the next 3600-second window, with at
    least one second between requests.

    Args:
        subreddit: Subreddit name to search.
        num_days: Number of consecutive 24-hour blocks to scrape.
        start_date: Start of the first window, formatted like
            ``'2018_01_01_00_00_00'`` (``%Y_%m_%d_%H_%M_%S``). It is
            converted with ``time.mktime``, so it is interpreted in the
            machine's local time zone.
        save_to_json: If true, write the submissions to
            ``./data/json/pushshift_{subreddit}_{start_date}.json`` and
            return None; otherwise return them.
        verbose: If true, print the time and day index roughly every 10%
            of ``num_days``.
        timer: If true, print the time when scraping starts and finishes.

    Returns:
        The list of collected submissions when ``save_to_json`` is false,
        otherwise None.

    Raises:
        requests.HTTPError: If a Pushshift request returns an error status.
    """
    pattern = '%Y_%m_%d_%H_%M_%S'

    after = int(time.mktime(time.strptime(start_date, pattern)))
    before = after + 3600
    posts = []

    ten_percentile = 0
    prev_query_time = 0

    pushshift_url = 'https://api.pushshift.io/reddit/search/submission/'
    # NOTE(review): presumably at most `size` results come back per window,
    # so an hour with more submissions than that would be truncated --
    # confirm against the Pushshift documentation.
    params = {
        'subreddit': subreddit,
        'after': after,
        'before': before,
        'size': 500,
    }

    if timer:
        print_time()

    for day in range(num_days):
        for hour in range(24):
            # Keep at least one second between consecutive requests.
            time_elapsed = time.time() - prev_query_time
            if time_elapsed <= 1:
                time.sleep(1 - time_elapsed)

            res = requests.get(pushshift_url, params=params)
            prev_query_time = time.time()
            # Surface HTTP failures directly instead of as a JSON/KeyError.
            res.raise_for_status()

            cur_page = res.json()
            posts.extend(cur_page['data'])

            # Slide the window forward by one hour.
            params['after'] = params['before']
            params['before'] = params['before'] + 3600

        if verbose and day / num_days > ten_percentile:
            print_time()
            print(day)
            ten_percentile += .1

        # Extra pause between days, on top of the per-request spacing.
        time.sleep(1)

    if timer:
        print_time()

    if save_to_json:
        with open(f'./data/json/pushshift_{subreddit}_{start_date}.json', 'w') as file:
            json.dump(posts, file)
    else:
        return posts


def year_scrape_push(subreddit, year):
    """Scrape one calendar year from Pushshift and merge it into a single file.

    Every month is first scraped by ``reddit_scrape_push`` into its own JSON
    file. The twelve monthly files are then read back in order and their
    contents concatenated into ``./data/json/pushshift_{subreddit}_{year}.json``.

    Args:
        subreddit: Subreddit name to scrape.
        year: Calendar year as an integer.
    """
    months = [f'{number:02d}' for number in range(1, 13)]
    # monthrange returns (weekday of the 1st, days in month); keep the latter.
    month_lengths = [monthrange(year, number)[1] for number in range(1, 13)]

    # Pass 1: scrape each month into its own file.
    for month, length in zip(months, month_lengths):
        reddit_scrape_push(subreddit, length, f'{year}_{month}_01_00_00_00',
                           save_to_json=True, verbose=False, timer=False)

    # Pass 2: read the monthly files back and stitch them together.
    posts = []
    for month in months:
        start_date = f'{year}_{month}_01_00_00_00'
        with open(f'./data/json/pushshift_{subreddit}_{start_date}.json', 'r') as file:
            posts.extend(json.load(file))

    with open(f'./data/json/pushshift_{subreddit}_{year}.json', 'w+') as file:
        json.dump(posts, file)

In [7]:
# Subreddits to archive, listed in pairs.
subreddits = [
    'math', 'learnmath',
    'python', 'learnpython',
    'datascience', 'learnmachinelearning',
]

# Scrape and merge all of 2017 for each subreddit in turn.
for subreddit in subreddits:
    year_scrape_push(subreddit, 2017)