# Reddit API Data Collection
###### By: Nick Gayliard

In [None]:
import requests
import time
import pandas as pd
import numpy as np
import re
import json

### GET requests

In [None]:
url = 'https://www.reddit.com/r/nba.json'

req = requests.get(url)

In [None]:
req

https://httpstatuses.com/429

### Requests with parameters / queries

The reddit API gave us a 429 (too many requests) error without a 'User-agent' header assigned. That value can be anything in the case of the reddit API. This can differ from API to API, or be completely unneeded. Many APIs will require a private key, given to you by the company. Be sure to PROTECT your API keys, especially ones attached to bank accounts / credit cards (e.g. Amazon Web Services and Google API keys)

In [None]:
req = requests.get(url, headers = {'User-agent' : 'Nick'})

In [None]:
req.status_code

#### Sample URL with a query

In [None]:
req2 = requests.get(url, headers = {'User-agent' : 'Nick'}, params = {'after' : 't3_bor3tn'})

In [None]:
req2.status_code

##### Everything after the '?' symbol in the URL is a query for specific information from the API. You need to check the API documentation to see what variables you can use to grab what information.

In [None]:
req2.url

In [None]:
req2.headers

In [None]:
# json.loads(req.content).keys()

### Let's check out our request content

In [None]:
# Raw response body as bytes - hard to read until it is parsed as JSON

req.content

#### Convert it to json and navigate through the json to the data we want

In [None]:
page_pull = req.json()

In [None]:
page_pull

In [None]:
page_pull.keys()

In [None]:
page_pull['data']

In [None]:
page_pull['data'].keys()

In [None]:
page_pull['data']['children']

In [None]:
page_pull['data']['children'][1]

In [None]:
len(page_pull['data']['children'])

Fields we want from each post: name, subreddit, selftext, title, num_comments, url, score

In [None]:
# When you are indexing deeply into json, it can help to make variable names for certain levels of indexing
# that you plan on reusing, to improve readability and make sure you don't make indexing errors as often

post_list = page_pull['data']['children']

In [None]:
post_list[1].keys()

In [None]:
for post in post_list:
    print(post['data']['name'])

In [None]:
post_list[0]['data']['title']

### Scrape and build a dictionary to make a dataframe

In [None]:
# Sloppy way! Too much indexing in loop

# Maps each post's unique 'name' to [title, num_comments].
post_dict = {}

# Note how post_list[count]['data'] is re-indexed three times per iteration,
# which is what makes this version hard to read.
for count, post in enumerate(post_list):
    post_dict[post_list[count]['data']['name']] = [post_list[count]['data']['title'], post_list[count]['data']['num_comments']]

In [None]:
# CLEAN WAY - using an indexer variable!!

# Maps each post's unique 'name' to [title, num_comments].
post_dict = {}

# Iterate over the posts directly: the loop variable already is the post,
# so there is no need to enumerate and look it up again by position.
for post in post_list:
    # Name the level we keep indexing into, so each lookup below stays short.
    post_indexer = post['data']
    post_dict[post_indexer['name']] = [post_indexer['title'], post_indexer['num_comments']]

In [None]:
# post_dict's keys start out as columns; transposing turns every post into a row
# indexed by its 'name'.
df = pd.DataFrame(post_dict).transpose()

# Label the two values stored for each post.
df.columns = ['title', 'num_comments']

# Last expression in the cell, so the notebook displays the frame.
df

## Put it in a function!

In [None]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

headers = {'User-agent' : 'Nick'}

def scraper_bike(url, pages=40, pause=.2):
    """Collect posts from consecutive pages of a reddit ``.json`` listing.

    Each response's ``data['after']`` value is sent back as the ``after``
    query parameter of the next request, so the pages follow one another.

    Parameters
    ----------
    url : str
        Reddit listing url ending in ``.json``.
    pages : int, optional
        Maximum number of pages to request (default 40).
    pause : float, optional
        Seconds to sleep between requests (default 0.2).

    Returns
    -------
    list
        The ``data['children']`` entries of every page fetched, in order.

    Raises
    ------
    requests.HTTPError
        If a response comes back with an error status (e.g. 429).
    """
    posts = []
    # No cursor yet. requests leaves out query parameters whose value is None,
    # so the first request asks for the first page.
    after = None

    for _ in range(pages):
        pagepull = requests.get(url=url, params={'after': after}, headers=headers)
        # Fail with a clear HTTP error instead of an obscure JSON/KeyError below.
        pagepull.raise_for_status()

        listing = pagepull.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # Without a cursor the next request would just fetch the first page
        # again and add duplicate posts, so stop here.
        if after is None:
            break

        # sleep is a best practice (probably not necessary for such a small scrape)
        time.sleep(pause)

    return posts

In [None]:
nba_post_list = scraper_bike('https://www.reddit.com/r/nba.json')

In [None]:
len(nba_post_list)

In [None]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    """Convert a list of reddit post dicts into a DataFrame.

    Parameters
    ----------
    post_list : list of dict
        Posts whose ``'data'`` dict holds at least the keys ``'name'``,
        ``'subreddit'``, ``'title'`` and ``'selftext'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``'name'`` (used as the index) with the columns
        ``subreddit``, ``title`` and ``selftext``. An empty ``post_list``
        gives an empty frame that still has those three columns.
    """
    columns = ['subreddit', 'title', 'selftext']

    # Keyed by 'name': a repeated post overwrites its earlier entry, so every
    # name ends up as exactly one row.
    rows = {
        post['data']['name']: [post['data'][column] for column in columns]
        for post in post_list
    }

    # orient='index' makes the dict keys the row labels; passing the columns
    # here (rather than assigning them afterwards) also works with zero rows.
    return pd.DataFrame.from_dict(rows, orient='index', columns=columns)

In [None]:
posts_to_df(nba_post_list)

## Couple extra functions for simplicity in running

In [None]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Run ``scrape_func`` on ``url`` and return the scraped posts as a DataFrame."""
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

### Function to scrape and save to csv. Saving is HIGHLY recommended whenever you gather data online: keep a copy locally (and a remote copy too if you want to be safe)

In [None]:
def scrape_add(scrape_func, url, import_file, export_file):
    """Scrape posts, merge them with a saved csv, and write the result to csv.

    Steps: scrape, import the existing csv, concatenate, drop duplicate
    posts, and output to csv.

    NOTE: ``import_file`` does not have to exist yet. If it is missing, only
    the freshly scraped posts are written, so the first run creates the csv.

    Parameters
    ----------
    scrape_func : callable
        Scraper function; called with ``url`` and expected to return a list of posts.
    url : str
        Url handed to ``scrape_func``.
    import_file : str
        Csv filename to import the previously saved posts from.
    export_file : str
        Csv filename to write the concatenated DataFrame to.

    Returns
    -------
    None
        The concatenated DataFrame is output as csv only.
    """
    scrape_df = posts_to_df(scrape_func(url))

    try:
        # 'Unnamed: 0' is the header pandas assigns to the unlabeled index
        # column that to_csv writes, i.e. the post names.
        imported_df = pd.read_csv(import_file, index_col = 'Unnamed: 0')
    except FileNotFoundError:
        # Nothing saved so far: start the csv from this scrape alone.
        concat_df = scrape_df
    else:
        concat_df = pd.concat([imported_df, scrape_df])

    # Saved rows come first, so keep='first' retains the saved copy of any
    # post that was scraped again.
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]

    concat_df.to_csv(export_file)