# Scraping scripts

In [10]:
import pandas as pd
import requests
import json
import time
import datetime

## Setting up the urls for requests
The Reddit data can be accessed for each subreddit using the following format: `http://www.reddit.com/r/{subreddit}/hot.json`. Reddit doesn't allow the default Python `requests` User-Agent, so I have to set my own (here, `JonBot 0.1`).

In [1]:
# Base URL for every request. HTTPS is used so requests are encrypted and do
# not have to go through an http -> https redirect first.
url_base = "https://www.reddit.com/"

# Final path component selecting the JSON view of a subreddit's 'hot' listing.
slug_hot = "hot.json"

# Intermediate path components selecting a specific subreddit.
slug_bpt = "r/BlackPeopleTwitter/"
slug_wpt = "r/WhitePeopleTwitter/"

# Custom User-agent header sent with every request (needed to get in).
user = {'User-agent': 'JonBot 0.1'}

## Scraping r/BlackPeopleTwitter
The script below sets up an empty list to hold one dictionary per post and chooses which extra features to extract from each post's top comment. It then scrapes the number of comments, the post title, the top comment's text, the upvotes for that comment, and the time that comment was created, for `n_scrapes` posts.

In [3]:
list_of_dictionaries = []  # one dictionary (row) per scraped post
aft = ''  # pagination query string; empty for the first listing page
features = ['created_utc', 'body', 'ups']  # fields copied from each post's top comment
n_scrapes = 250
progress_every = 25  # print a status line each time this many posts have been scraped

###
# Each listing page is requested once; every post on it then needs one more
# request to fetch its comments.
###
while len(list_of_dictionaries) < n_scrapes:
    url_bpt = url_base + slug_bpt + slug_hot + aft
    res = requests.get(url_bpt, headers=user, timeout=30)  # Listing request: information about the posts
    res.raise_for_status()  # fail loudly on an HTTP error instead of a confusing KeyError below
    listing = res.json()['data']

    for child in listing['children']:
        if len(list_of_dictionaries) >= n_scrapes:
            break

        post = child['data']
        this_dict = {
            'comments_this_post': post['num_comments'],
            'post_title': post['title'],
        }

        # Now that the post info is added, we move on to the comment

        url_bpt_comments = url_base + slug_bpt + 'comments/' + post['id'] + '.json'
        res = requests.get(url_bpt_comments, headers=user, timeout=30)  # Comment request: information about the top comment
        res.raise_for_status()
        thread = res.json()
        try:
            comment_data = thread[1]['data']['children'][0]['data']
        except (IndexError, KeyError, TypeError):
            # No top-level comment on this post: record missing values rather
            # than reusing the comment left over from the previous post.
            comment_data = {}
        for feature in features:
            this_dict[feature] = comment_data.get(feature)  # None when the field is absent
        this_dict['current_time'] = time.time()  # an extra column for the time the post was scraped.
        list_of_dictionaries.append(this_dict)

        if len(list_of_dictionaries) % progress_every == 0:
            print("{} posts scraped!".format(len(list_of_dictionaries)))
        time.sleep(2)  # Keeping us from getting booted out.

    if len(list_of_dictionaries) >= n_scrapes:
        break
    if not listing['children'] or listing['after'] is None:
        # No further page to request; stop instead of building '?after=' from None.
        print("Listing ended after {} posts.".format(len(list_of_dictionaries)))
        break
    aft = '?after=' + listing['after']  # move to the next page

###
# Finally, we save the file with the current month, day, hour, and minute
# (hour and minute are written back to back, without zero padding).
###

now = datetime.datetime.now()
pd.DataFrame(list_of_dictionaries).to_csv('Bpt_250_{}-{}_{}{}.csv'.format(now.month, now.day, now.hour, now.minute))

KeyboardInterrupt: 

## The same for r/WhitePeopleTwitter

In [None]:
list_of_dictionaries = []  # one dictionary (row) per scraped post
aft = ''  # pagination query string; empty for the first listing page
features = ['created_utc', 'body', 'ups']  # fields copied from each post's top comment
n_scrapes = 250
progress_every = 25  # print a status line each time this many posts have been scraped

###
# Each listing page is requested once; every post on it then needs one more
# request to fetch its comments.
###
while len(list_of_dictionaries) < n_scrapes:
    url_wpt = url_base + slug_wpt + slug_hot + aft
    res = requests.get(url_wpt, headers=user, timeout=30)  # Listing request: information about the posts
    res.raise_for_status()  # fail loudly on an HTTP error instead of a confusing KeyError below
    listing = res.json()['data']

    for child in listing['children']:
        if len(list_of_dictionaries) >= n_scrapes:
            break

        post = child['data']
        this_dict = {
            'comments_this_post': post['num_comments'],
            'post_title': post['title'],
        }

        # Now that the post info is added, we move on to the comment

        url_wpt_comments = url_base + slug_wpt + 'comments/' + post['id'] + '.json'
        res = requests.get(url_wpt_comments, headers=user, timeout=30)  # Comment request: information about the top comment
        res.raise_for_status()
        thread = res.json()
        try:
            comment_data = thread[1]['data']['children'][0]['data']
        except (IndexError, KeyError, TypeError):
            # No top-level comment on this post: record missing values rather
            # than reusing the comment left over from the previous post.
            comment_data = {}
        for feature in features:
            this_dict[feature] = comment_data.get(feature)  # None when the field is absent
        this_dict['current_time'] = time.time()  # an extra column for the time the post was scraped.
        list_of_dictionaries.append(this_dict)

        if len(list_of_dictionaries) % progress_every == 0:
            print("{} posts scraped!".format(len(list_of_dictionaries)))
        time.sleep(2)  # Keeping us from getting booted out.

    if len(list_of_dictionaries) >= n_scrapes:
        break
    if not listing['children'] or listing['after'] is None:
        # No further page to request; stop instead of building '?after=' from None.
        print("Listing ended after {} posts.".format(len(list_of_dictionaries)))
        break
    aft = '?after=' + listing['after']  # move to the next page

###
# Finally, we save the file with the current month, day, hour, and minute
# (hour and minute are written back to back, without zero padding).
###

now = datetime.datetime.now()
pd.DataFrame(list_of_dictionaries).to_csv('Wpt_250_{}-{}_{}{}.csv'.format(now.month, now.day, now.hour, now.minute))