# Reddit API Data Collection
###### By: Nick Gayliard

In [1]:
import requests
import time
import pandas as pd
import numpy as np
import re
import json

### GET requests

In [2]:
url = 'https://www.reddit.com/r/nba.json'

req = requests.get(url)

In [3]:
req

<Response [429]>

https://httpstatuses.com/429

### Requests with parameters / queries

The Reddit API returned a 429 (Too Many Requests) error because we did not set our own 'User-agent' header. In the case of the Reddit API, that value can be anything. This requirement differs from API to API, and some APIs do not need it at all. Many APIs will require a private key, given to you by the company. Be sure to PROTECT your API keys, especially ones attached to bank accounts / credit cards (e.g. Amazon Web Services and Google API keys).

In [4]:
req = requests.get(url, headers = {'User-agent' : 'Nick'})

In [5]:
req.status_code

200

#### Sample URL with a query

In [6]:
req2 = requests.get(url, headers = {'User-agent' : 'Nick'}, params = {'after' : 't3_bor3tn'})

In [7]:
req2.status_code

200

##### Everything after the '?' symbol in the URL is a query for specific information from the API. You need to check the API documentation to see what variables you can use to grab what information.

In [8]:
req2.url

'https://www.reddit.com/r/nba.json?after=t3_bor3tn'

In [9]:
req2.headers

{'Content-Type': 'application/json; charset=UTF-8', 'x-ua-compatible': 'IE=edge', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'set-cookie': 'loid=00000000003seiksp6.2.1558103534528.Z0FBQUFBQmMzc1h1VGx4eVBfUkxHWk8wRk5tdFdHQ09hUVBqaTYwM091RWRnRzhfNnZaajRlQkVITUY0NEMtdFBmWUd5RDAyc2VlX2VxX3FYM2JZdGtkeUhvdUd6RFB4Q2R5cUlUS2lYSmkyczRfQ3RFVkhoaEI5VmY3bzFvZkZuMTBVUTNxcDZZV1g; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Sun, 16-May-2021 14:32:14 GMT; secure, session_tracker=zp75oTm5Sk0Ymq9UMd.0.1558103534528.Z0FBQUFBQmMzc1h1S21fbm5iWE9MVWY0S01kM2FUalFaN0ZVdi1kdWJKcm5NZDJfNF9WVG9lbmtfajRab2ZiOHoxcFh1NTAwZlF5QUdQUWFmZ0JDMjEzeGQxcWxuMXA3YjZIQXIxbmg0MGQwdFE4UjJtS19WN3lNb2pESXJ2TEpBUVA5OUQ2MnVKLXo; Domain=reddit.com; Max-Age=7199; Path=/; expires=Fri, 17-May-2019 16:32:14 GMT; secure, edgebucket=UlXLX4DdPTfW6FhxmT; Domain=reddit.com; Max-Age=63071999; Path=/;  secure', 'access-control-allow-origin': '*', 'access-control-expose-he

In [10]:
# json.loads(req.content).keys()

### Let's check out our request content

In [11]:
# The raw response body as a bytes object (note the b'...' prefix) - not yet parsed into Python objects

req.content

b'{"kind": "Listing", "data": {"modhash": "", "dist": 27, "children": [{"kind": "t3", "data": {"approved_at_utc": null, "subreddit": "nba", "selftext": "# Today\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n|9:00 PM ET|[Portland Trail Blazers](/r/ripcity)|&gt;!111!&lt; at &gt;!114!&lt;|[Golden State Warriors](/r/warriors)|FINAL|[(Link)](https://reddit.com/r/nba/comments/bpk3bc)||\\n\\n# Yesterday\'s Games:\\n\\n|Tip-off|Away||Home||GDT|PGT|\\n|:--|:--|:-:|:--|--:|:-:|:-:|\\n|8:30 PM ET|[Toronto Raptors](/r/torontoraptors)|&gt;!100!&lt; at &gt;!108!&lt;|[Milwaukee Bucks](/r/mkebucks)|FINAL|||\\n\\n# Top Highlights:\\n\\n0. [Ryen Russillo: \\"Stephen A came out and said that people close to Jeanie Buss were saying you should trade LeBron and we found it was people from her spin class.\\"](https://streamable.com/6sc4p) | [(Comments)](https://reddit.com/r/nba/comments/bphapo)\\n\\n0. [ESPN\'s Get Up host Mike Greenberg tries to get Jalen Rose to talk about 

#### Convert it to json and navigate through the json to the data we want

In [12]:
page_pull = req.json()

In [13]:
page_pull

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 27,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'nba',
     'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|9:00 PM ET|[Portland Trail Blazers](/r/ripcity)|&gt;!111!&lt; at &gt;!114!&lt;|[Golden State Warriors](/r/warriors)|FINAL|[(Link)](https://reddit.com/r/nba/comments/bpk3bc)||\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|8:30 PM ET|[Toronto Raptors](/r/torontoraptors)|&gt;!100!&lt; at &gt;!108!&lt;|[Milwaukee Bucks](/r/mkebucks)|FINAL|||\n\n# Top Highlights:\n\n0. [Ryen Russillo: "Stephen A came out and said that people close to Jeanie Buss were saying you should trade LeBron and we found it was people from her spin class."](https://streamable.com/6sc4p) | [(Comments)](https://reddit.com/r/nba/comments/bphapo)\n\n0. [ESPN\'s Get Up host Mike Greenberg tries to get Jalen Rose to talk about the

In [14]:
page_pull.keys()

dict_keys(['kind', 'data'])

In [15]:
page_pull['data']

{'modhash': '',
 'dist': 27,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'nba',
    'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|9:00 PM ET|[Portland Trail Blazers](/r/ripcity)|&gt;!111!&lt; at &gt;!114!&lt;|[Golden State Warriors](/r/warriors)|FINAL|[(Link)](https://reddit.com/r/nba/comments/bpk3bc)||\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|8:30 PM ET|[Toronto Raptors](/r/torontoraptors)|&gt;!100!&lt; at &gt;!108!&lt;|[Milwaukee Bucks](/r/mkebucks)|FINAL|||\n\n# Top Highlights:\n\n0. [Ryen Russillo: "Stephen A came out and said that people close to Jeanie Buss were saying you should trade LeBron and we found it was people from her spin class."](https://streamable.com/6sc4p) | [(Comments)](https://reddit.com/r/nba/comments/bphapo)\n\n0. [ESPN\'s Get Up host Mike Greenberg tries to get Jalen Rose to talk about the possibility of Zion not playing f

In [16]:
page_pull['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [17]:
page_pull['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'nba',
   'selftext': '# Today\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|9:00 PM ET|[Portland Trail Blazers](/r/ripcity)|&gt;!111!&lt; at &gt;!114!&lt;|[Golden State Warriors](/r/warriors)|FINAL|[(Link)](https://reddit.com/r/nba/comments/bpk3bc)||\n\n# Yesterday\'s Games:\n\n|Tip-off|Away||Home||GDT|PGT|\n|:--|:--|:-:|:--|--:|:-:|:-:|\n|8:30 PM ET|[Toronto Raptors](/r/torontoraptors)|&gt;!100!&lt; at &gt;!108!&lt;|[Milwaukee Bucks](/r/mkebucks)|FINAL|||\n\n# Top Highlights:\n\n0. [Ryen Russillo: "Stephen A came out and said that people close to Jeanie Buss were saying you should trade LeBron and we found it was people from her spin class."](https://streamable.com/6sc4p) | [(Comments)](https://reddit.com/r/nba/comments/bphapo)\n\n0. [ESPN\'s Get Up host Mike Greenberg tries to get Jalen Rose to talk about the possibility of Zion not playing for NOLA and Jalen shuts it down: "Please kill

In [18]:
page_pull['data']['children'][1]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'nba',
  'selftext': "Here is a place to have in depth, x's and o's, discussions on yesterday's games. Post-game discussions are linked in the table, keep your memes and reactions there.\n\n\n|Away|Home|Score|PGT|\n|:--|:--|:-:|:-:|\n|[](/POR) Portland Trail Blazers |[](/GSW) Golden State Warriors | 111 - 114 | ([Link](https://www.reddit.com/r/nba/comments/bpfi9x/flynn_patricks_1st_2019_nba_mock_draft/))|",
  'author_fullname': 't2_j12jv',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 0,
  'clicked': False,
  'title': '[SERIOUS NEXT DAY THREAD] Post-Game Discussion (May 16, 2019)',
  'link_flair_richtext': [],
  'subreddit_name_prefixed': 'r/nba',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': None,
  'downs': 0,
  'hide_score': True,
  'name': 't3_bprcjw',
  'quarantine': False,
  'link_flair_text_color': 'dark',
  'author_flair_background_color': '',
  'subreddit_type': 'public',
  'ups': 27,
  't

In [19]:
len(page_pull['data']['children'])

27

Fields of interest in each post's `data` dictionary: name, subreddit, selftext, title, num_comments, url, score

In [20]:
# When you are indexing deeply into json, it can help to make variable names for certain levels of indexing
# that you plan on reusing, to improve readability and make sure you don't make indexing errors as often

post_list = page_pull['data']['children']

In [21]:
post_list[1].keys()

dict_keys(['kind', 'data'])

In [22]:
for post in post_list:
    print(post['data']['name'])

t3_bpdou0
t3_bprcjw
t3_bpm94g
t3_bpngwa
t3_bpnf1h
t3_bpm93w
t3_bpma7v
t3_bpldf2
t3_bphapo
t3_bpml6c
t3_bpi4ci
t3_bpma16
t3_bpjucj
t3_bpmcwe
t3_bpn5y3
t3_bpmgl2
t3_bpmyom
t3_bpo1xl
t3_bpk99c
t3_bpeimm
t3_bpn7vc
t3_bpmchm
t3_bplfy2
t3_bpghmj
t3_bpm8c8
t3_bpmabg
t3_bpgpbh


In [23]:
post_list[0]['data']['title']

'Game Threads Index + Daily Discussion (May 16, 2019)'

### Scrape and build a dictionary to make a dataframe

In [24]:
# Sloppy way! Too much indexing in loop
# Every value is looked up through the full post_list[count]['data'][...] chain,
# so the same long index expression is repeated three times on one line
# (the loop variable `post` is never used).

post_dict = {}

for count, post in enumerate(post_list):
    post_dict[post_list[count]['data']['name']] = [post_list[count]['data']['title'], post_list[count]['data']['num_comments']]

In [25]:
# CLEAN WAY - using an indexer variable!!
# Iterate over the posts themselves (no counter needed) and grab each post's
# 'data' dictionary once, then reuse it for every field.

post_dict = {}

for post in post_list:
    post_indexer = post['data']
    # key: the post's unique 'name' id -> value: [title, num_comments]
    post_dict[post_indexer['name']] = [post_indexer['title'], post_indexer['num_comments']]

In [26]:
# post_dict maps post name -> [title, num_comments]; as a DataFrame each post is a
# column, so transpose to get one row per post.
df = pd.DataFrame(post_dict).transpose()

# Label the two value columns in the order they were stored in post_dict
df.columns = ['title', 'num_comments']

df

Unnamed: 0,title,num_comments
t3_bpdou0,"Game Threads Index + Daily Discussion (May 16,...",100
t3_bprcjw,[SERIOUS NEXT DAY THREAD] Post-Game Discussion...,46
t3_bpm94g,[Post Game Thread] The Golden State Warriors (...,2834
t3_bpngwa,"Klay Thompson, asked about his front-row seat ...",186
t3_bpnf1h,"Reporter to Jordan Bell: ""As a young player, w...",158
t3_bpm93w,Iggy steal to seal Game 2 | ESPN,698
t3_bpma7v,Zach Collins in Game 2: 0/0/0/0/0 with 5 fouls...,274
t3_bpldf2,Seth pickpockets Steph,158
t3_bphapo,"Ryen Russillo: ""Stephen A came out and said th...",905
t3_bpml6c,Camera man chasing down Iggy like it’s the Mau...,57


## Put it in a function!

In [27]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

headers = {'User-agent' : 'Nick'}

def scraper_bike(url, pages=40, pause=.2):
    """Collect posts from a reddit ``.json`` listing by following its 'after' cursor.

    Request headers come from the module-level ``headers`` dict.

    Parameters
    ----------
    url : str
        A reddit listing URL ending in ``.json``.
    pages : int, default 40
        Maximum number of listing pages to request.
    pause : float, default 0.2
        Seconds to sleep between consecutive requests.

    Returns
    -------
    list
        Every entry of each page's ``['data']['children']``, in the order fetched.

    Raises
    ------
    requests.HTTPError
        If a page comes back with a 4xx/5xx status (e.g. 429).
    """
    posts = []
    # No cursor for the first page; requests leaves out params whose value is None
    after = None

    for _ in range(pages):
        pagepull = requests.get(url, params={'after': after}, headers=headers)
        # Fail here with a clear HTTP error rather than later on a missing 'data' key
        pagepull.raise_for_status()
        listing = pagepull.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        if after is None:
            # No further cursor: another request would be sent without 'after'
            # and return the first page again, adding duplicate posts.
            break

        # sleep is a best practice (probably not necessary for such a small scrape)
        time.sleep(pause)

    return posts

In [28]:
nba_post_list = scraper_bike('https://www.reddit.com/r/nba.json')

In [29]:
len(nba_post_list)

980

In [30]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    """Convert a list of raw reddit posts into a DataFrame indexed by post name.

    Parameters
    ----------
    post_list : list of dict
        Posts as returned by the listing, each holding its fields under ``post['data']``
        (needs the keys 'name', 'subreddit', 'title' and 'selftext').

    Returns
    -------
    pandas.DataFrame
        One row per unique 'name' with columns 'subreddit', 'title', 'selftext'.
        When a name occurs more than once, the last occurrence's values are kept
        at the position of the first occurrence. An empty ``post_list`` gives an
        empty frame that still has the three columns.
    """
    columns = ['subreddit', 'title', 'selftext']
    post_dict = {}

    # Keying by 'name' is what removes duplicate posts
    for post in post_list:
        ind = post['data']
        post_dict[ind['name']] = [ind[col] for col in columns]

    # Build the frame in one step with explicit labels so that zero posts
    # does not fail on a column-count mismatch.
    return pd.DataFrame(
        list(post_dict.values()),
        index=list(post_dict.keys()),
        columns=columns,
    )

In [31]:
posts_to_df(nba_post_list)

Unnamed: 0,subreddit,title,selftext
t3_bpdou0,nba,"Game Threads Index + Daily Discussion (May 16,...",# Today's Games:\n\n|Tip-off|Away||Home||GDT|P...
t3_bprcjw,nba,[SERIOUS NEXT DAY THREAD] Post-Game Discussion...,"Here is a place to have in depth, x's and o's,..."
t3_bpm94g,nba,[Post Game Thread] The Golden State Warriors (...,**[](/POR) POR**|**Min**|**FG**|**FT**|**3PT**...
t3_bpngwa,nba,"Klay Thompson, asked about his front-row seat ...",[Via The Athletic](https://twitter.com/TheAthl...
t3_bpnf1h,nba,"Reporter to Jordan Bell: ""As a young player, w...","&gt;Reporter to Jordan Bell: ""As a young playe..."
t3_bpm93w,nba,Iggy steal to seal Game 2 | ESPN,
t3_bpma7v,nba,Zach Collins in Game 2: 0/0/0/0/0 with 5 fouls...,"Granted he's just a sophomore, but Zach Collin..."
t3_bpldf2,nba,Seth pickpockets Steph,
t3_bphapo,nba,"Ryen Russillo: ""Stephen A came out and said th...",
t3_bpml6c,nba,Camera man chasing down Iggy like it’s the Mau...,


## Couple extra functions for simplicity in running

In [32]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    """Call ``scrape_func(url)`` and return its posts converted by ``posts_to_df``."""
    scraped_posts = scrape_func(url)
    return posts_to_df(scraped_posts)

### Function to scrape and save to csv. When gathering data online, it is HIGHLY recommended to keep a local copy (and a remote one if you want to be secure)

In [33]:
# NOTE: import_file does not have to exist yet.
# If it is missing, the export contains only the fresh scrape.

# scrape, import csv, concat, drop duplicate, and output to csv

# takes in scraper function, url, csv filename to import, csv filename to output

# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    """Scrape ``url``, merge the posts with a previously saved csv, and write the result.

    Parameters
    ----------
    scrape_func : callable
        Called as ``scrape_func(url)``; its result is passed to ``posts_to_df``.
    url : str
        URL handed to ``scrape_func``.
    import_file : str
        Path of an earlier csv export whose first column holds the row index.
        If the file does not exist, only the new scrape is exported.
    export_file : str
        Path the merged csv is written to.

    Returns
    -------
    None
        The merged frame is written to ``export_file``.
    """
    scrape_df = posts_to_df(scrape_func(url))

    try:
        # index_col=0 reads the first csv column (where to_csv writes the index)
        # back as the index, whether or not that column has a header name.
        imported_df = pd.read_csv(import_file, index_col=0)
    except FileNotFoundError:
        imported_df = None

    if imported_df is None:
        concat_df = scrape_df
    else:
        concat_df = pd.concat([imported_df, scrape_df])

    # Previously saved rows come first, so keep='first' lets them win over
    # freshly scraped rows with the same index label.
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]

    concat_df.to_csv(export_file)