# Step 1: Web Scraping
Here, we use the [Pushshift.io](https://github.com/pushshift/api) API to scrape Reddit submissions from r/stocks and save them as JSON files (one file per day). We then use PRAW to download the comments of every scraped submission.  
**NOTE:** This notebook takes approximately 6 hours to run to completion; this can be reduced by narrowing the date range (`start_date` / `end_date`) in cell #3.

In [1]:
# Default number of submissions requested per API call.
N_POSTS = 1000

def getUrl(start, end, subreddit='stocks', size=None):
    """Build the Pushshift submission-search URL for one time window.

    The query asks for submissions created after ``start`` and before ``end``,
    sorted by score in descending order.

    Parameters
    ----------
    start, end : datetime.datetime
        Bounds of the window. They are converted to integer Unix epochs with
        ``datetime.timestamp()``; note that Python interprets *naive*
        datetimes as local time, so pass timezone-aware values if the window
        must be identical on every machine.
    subreddit : str, optional
        Subreddit to search. Defaults to ``'stocks'``.
    size : int, optional
        Number of results to request. Defaults to the module-level
        ``N_POSTS`` (looked up at call time).

    Returns
    -------
    str
        The fully formed request URL.
    """
    from urllib.parse import quote

    if size is None:
        size = N_POSTS
    startEpoch = int(start.timestamp())
    endEpoch = int(end.timestamp())
    return (
        'https://api.pushshift.io/reddit/submission/search/'
        f'?size={size}&after={startEpoch}&before={endEpoch}'
        # Percent-encode so an unusual subreddit string cannot inject extra
        # query parameters.
        f'&sort_type=score&sort=desc&subreddit={quote(str(subreddit), safe="")}'
    )

In [2]:
#https://stackoverflow.com/questions/40748687/python-api-rate-limiting-how-to-limit-api-calls-globally
import requests
import json

# Intended request budget: at most CALLS requests per RATE_LIMIT seconds.
CALLS = 60
RATE_LIMIT = 60

# NOTE: rate limiting is currently disabled -- the decorators below are
# commented out, so CALLS / RATE_LIMIT are not used by getData.
#@sleep_and_retry
#@limits(calls=CALLS, period=RATE_LIMIT)
def getData(urlStr, timeout=30):
    """Fetch ``urlStr`` with an HTTP GET and return the decoded JSON body.

    Parameters
    ----------
    urlStr : str
        URL to request.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes the body to.

    Raises
    ------
    Exception
        If the response status code is not 200; the message contains the
        status code. Errors raised by ``requests`` itself (connection errors,
        timeouts) and by JSON decoding propagate unchanged.
    """
    response = requests.get(urlStr, timeout=timeout)
    if response.status_code != 200:
        raise Exception('API response: {}'.format(response.status_code))
    return response.json()

In [3]:
# GET /hello/world
#https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
from datetime import date, timedelta, datetime
import time

def daterange(start_date, end_date):
    """Yield consecutive one-day ``(window_start, window_end)`` pairs.

    The first window starts at ``start_date``; each following window starts
    where the previous one ended. The number of windows is the whole-day part
    of ``end_date - start_date``, so a trailing partial day is dropped and a
    non-positive span yields nothing.
    """
    one_day = timedelta(days=1)
    whole_days = (end_date - start_date).days
    window_start = start_date
    for _ in range(whole_days):
        window_end = window_start + one_day
        yield window_start, window_end
        window_start = window_end

import os  # local to this cell: needed for makedirs below

# Date range to scrape: one request (and one output file) per day in
# [start_date, end_date).
start_date = datetime(2013, 1, 1)
end_date = datetime(2013, 1, 3)

# Retry policy for a single day's API request.
MAX_ATTEMPTS = 10        # give up (and surface the error) after this many tries
INITIAL_BACKOFF_S = 0.1  # first wait between tries, doubled after each failure
MAX_BACKOFF_S = 30       # upper bound for a single wait

SUBMISSIONS_OUT_DIR = '../data/out/'


def fetchPostsWithRetry(start, end):
    """Return the ``'data'`` list of the API response for one window.

    Failed requests are retried with exponential backoff. After
    ``MAX_ATTEMPTS`` failures the last exception is re-raised rather than
    looping forever, so a persistent problem (bad URL, API gone, unexpected
    response shape) becomes visible instead of hanging the notebook.
    """
    delay = INITIAL_BACKOFF_S
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return getData(getUrl(start, end))['data']
        except Exception as e:  # getData signals HTTP errors with a bare Exception
            if attempt == MAX_ATTEMPTS:
                raise
            print(f'{start:%Y-%m-%d}: attempt {attempt}/{MAX_ATTEMPTS} failed '
                  f'({e!r}); retrying in {delay:.1f}s')
            time.sleep(delay)
            delay = min(delay * 2, MAX_BACKOFF_S)


os.makedirs(SUBMISSIONS_OUT_DIR, exist_ok=True)

for (start, end) in daterange(start_date, end_date):
    data = fetchPostsWithRetry(start, end)
    filename = start.strftime("%Y-%m-%d") + '.json'
    # Writing happens outside the retry logic: a file-system error is not
    # transient and must not be retried.
    with open(SUBMISSIONS_OUT_DIR + filename, 'w') as f:
        json.dump(data, f)
    print(filename + ' done')
    


2013-01-01.json done
2013-01-02.json done


In [4]:
# Reddit API credentials are read from the environment so that secrets are
# never stored in the notebook, its outputs, or version control.
# SECURITY: the client id/secret that used to be hard-coded in this cell must
# be treated as leaked -- revoke them and generate a new pair.
import os

_missing_credentials = [
    name for name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET')
    if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(_missing_credentials)
        + '. Set them to your Reddit app credentials before running this cell.'
    )

public_key = os.environ['REDDIT_CLIENT_ID']
secret_key = os.environ['REDDIT_CLIENT_SECRET']

In [5]:
import praw

# Reddit API client used by the comment-scraping cells below. Only the app
# credentials and a user agent are supplied (no username/password).
reddit = praw.Reddit(**{
    "user_agent": "my user agent",
    "client_id": public_key,
    "client_secret": secret_key,
})


In [6]:
def getDataForSumbission(submission):
    """Flatten all comments of ``submission`` into a list of feature dicts.

    Each dict describes one comment: its own fields (``time``, ``comment_id``,
    ``body``, ``score``, ``num_replies``, ``is_root``), fields of the
    submission (``submission_id``, ``submission_score``, ``submission_title``,
    ``submission_text``) and fields of its parent (``parent_text``,
    ``parent_score``, ``parent_ratio``). For a root comment the "parent" is
    the submission itself (title and selftext joined by a space).

    Ratios are ``comment score / other score``. When the divisor is zero a
    sentinel is stored instead: ``100`` for ``submission_ratio`` and ``-999``
    for the ratio against a parent comment.
    """
    # limit=None: resolve every "more comments" placeholder before listing.
    submission.comments.replace_more(limit=None)

    def describe(comment):
        """Build the feature dict for a single comment."""
        sub_score = submission.score
        row = {
            'time': comment.created_utc,
            'submission_id': submission.id,
            'comment_id': comment.id,
            'body': comment.body,
            'score': comment.score,
            'num_replies': len(comment.replies),
            'is_root': 1 if comment.is_root else 0,
            'submission_score': sub_score,
            'submission_ratio': comment.score / sub_score if sub_score != 0 else 100,
            'submission_title': submission.title,
            'submission_text': submission.selftext,
        }

        if comment.is_root:
            # Top-level comment: reuse the submission-level values.
            row['parent_text'] = submission.title + ' ' + submission.selftext
            row['parent_score'] = row['submission_score']
            row['parent_ratio'] = row['submission_ratio']
            return row

        parent = comment.parent()
        parent_score = parent.score
        row['parent_text'] = parent.body
        row['parent_score'] = parent_score
        if parent_score != 0:
            row['parent_ratio'] = comment.score / parent_score
        else:
            row['parent_ratio'] = -999
        return row

    return [describe(comment) for comment in submission.comments.list()]

In [7]:
import os
import json
SUBMISSIONS_DIR = '../data/out/'
COMMENTS_DIR = '../data/comments/'

os.makedirs(COMMENTS_DIR, exist_ok=True)

# Only the per-day JSON dumps are inputs; other directory entries (for example
# Jupyter's .ipynb_checkpoints folder) would make open()/json.load fail.
# Sorting makes the processing order, and therefore the log, deterministic.
submission_files = sorted(
    name for name in os.listdir(SUBMISSIONS_DIR) if name.endswith('.json')
)

for i, file in enumerate(submission_files, start=1):
    # Read the day's submissions and close the input file before the slow
    # comment download starts.
    with open(SUBMISSIONS_DIR + file) as f:
        datas = json.load(f)

    # Collect the comment rows of every submission of that day.
    output = []
    for data in datas:
        submission = reddit.submission(id=data['id'])
        output.extend(getDataForSumbission(submission))

    # Same file name as the input, written to the comments directory.
    with open(COMMENTS_DIR + file, 'w') as f:
        json.dump(output, f)
    print(str(i) + ': ' + file)

1: 2013-01-01.json
2: 2013-01-02.json
