# Getting Reddit Data

There are two options for extracting data from Reddit:

* The `requests` library, which will allow us to interface directly with the Reddit API.

* The PRAW library, which is a wrapper library that adds an extra layer of abstraction in accessing the Reddit API.

Here we will cover the first option, using the `requests` library to interface directly with the API.

The final extraction script will look like this:

In [1]:
import requests

In [22]:
def reading(file_name = 'credentials.txt'):
    s = open(file_name, 'r').read()
    dict = eval(s)
    return(dict)

In [23]:
dict = reading()

#### Commented code is from the lecture, but results in `'invalid_grant'` error

In [61]:
auth = requests.auth.HTTPBasicAuth(dict['client_id'], dict['secret'])

data= {'grant_type': 'password',
        'username': dict['user'],
        'password': dict['pwd']}

headers = {'User-Agent': 'MyBot/0.0.1'}

res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=data, headers=headers)

res

# Output: <Response [200]>

res.json()

# Output: {'error': 'invalid_grant'}

token = res.json()['access_token']

headers['Authorization'] = f'bearer {token}'

In [62]:
headers

{'User-Agent': 'MyBot/0.0.1',
 'Authorization': 'bearer 2371907889939-UprBkAxdLUCLN7a3b08jnl6UrhqTug'}

In [63]:
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [64]:
api = 'https://oauth.reddit.com'

In [80]:
res = requests.get(f'{api}/r/hawwkey/new', headers=headers)

In [103]:
# Commented to reduce jupyter notebook size
# res.json()

In [82]:
res = requests.get(f'{api}/r/hawwkey/new', headers=headers, params={'limit': '100'})

In [104]:
# Commented to reduce jupyter notebook size
# res.json()

name, created_utc, subreddit, title, selftext, upvote_ratio, ups, downs, score

In [87]:
import pandas as pd

df = pd.DataFrame({
    'name': [], 
    'created_utc': [], 
    'subreddit': [], 
    'title': [], 
    'selftext': [], 
    'upvote_ratio': [], 
    'ups': [], 
    'downs': [], 
    'score': []
})

In [89]:
for posts in res.json()['data']['children']:
    df = df.append({
        'name': posts['data']['name'], 
        'created_utc': posts['data']['created_utc'], 
        'subreddit': posts['data']['subreddit'], 
        'title': posts['data']['title'], 
        'selftext': posts['data']['selftext'], 
        'upvote_ratio': posts['data']['upvote_ratio'], 
        'ups': posts['data']['ups'], 
        'downs': posts['data']['downs'], 
        'score': posts['data']['score']
}, ignore_index=True)

In [97]:
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_yua5fs,1668365000.0,hawwkey,Sabres welcome fans from Roswell Park Comprehe...,,1.0,17.0,0.0,17.0
1,t3_ys0byl,1668138000.0,hawwkey,Julien Gauthiers dad does a little dance after...,,0.81,3.0,0.0,3.0
2,t3_yn8i0r,1667687000.0,hawwkey,Tarasenko absolutely leveled by 9 year old :),,0.88,98.0,0.0,98.0
3,t3_ymighg,1667617000.0,hawwkey,Local Finland Kids Participate in Avalanche-Bl...,,1.0,10.0,0.0,10.0
4,t3_ymhthh,1667616000.0,hawwkey,some of the hurricanes dancing along with the ...,,0.98,257.0,0.0,257.0


In [91]:
df['name'].iloc[len(df)-1]

't3_qwulcj'

In [94]:
while True:
    res = requests.get(f'{api}/r/hawwkey/new', headers=headers, 
                       params={'limit': '100',
                               'after': df['name'].iloc[len(df)-1]})
    if (len(res.json()['data']['children'])==0):
        break
    for posts in res.json()['data']['children']:
        df = df.append({
            'name': posts['data']['name'], 
            'created_utc': posts['data']['created_utc'], 
            'subreddit': posts['data']['subreddit'], 
            'title': posts['data']['title'], 
            'selftext': posts['data']['selftext'], 
            'upvote_ratio': posts['data']['upvote_ratio'], 
            'ups': posts['data']['ups'], 
            'downs': posts['data']['downs'], 
            'score': posts['data']['score']
        }, ignore_index=True)

In [95]:
df.tail()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
994,t3_5mixso,1483769000.0,hawwkey,Nikita Tryampkin helping Brandon Sutter to the...,,0.98,70.0,0.0,70.0
995,t3_5mbgs1,1483676000.0,hawwkey,Colton Sissons giggling giddily after scoring ...,,0.98,52.0,0.0,52.0
996,t3_5m91hk,1483650000.0,hawwkey,Matthew Ford pranks kid on bench,,0.96,97.0,0.0,97.0
997,t3_5lxwqu,1483517000.0,hawwkey,Foligno &amp; Bobs hugging out the 16th straig...,,0.99,138.0,0.0,138.0
998,t3_5llzsw,1483374000.0,hawwkey,Tarasenko and his son,,0.95,120.0,0.0,120.0


In [99]:
df = df.replace({'|': ''}, regex=True)

In [106]:
df.to_csv('./data/reddit_hawwkey.csv', sep='|', index=False)

In [None]:
res.json()['data']['children'][0]['data']

In [86]:
res.json()['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'hawwkey',
 'selftext': '',
 'author_fullname': 't2_flz5r',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'Sabres welcome fans from Roswell Park Comprehensive Cancer Center on Hockey Fights Cancer Night',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/hawwkey',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': None,
 'downs': 0,
 'top_awarded_type': None,
 'hide_score': True,
 'name': 't3_yua5fs',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'upvote_ratio': 1.0,
 'author_flair_background_color': None,
 'subreddit_type': 'public',
 'ups': 17,
 'total_awards_received': 0,
 'media_embed': {},
 'author_flair_template_id': None,
 'is_original_content': False,
 'user_reports': [],
 'secure_media': None,
 'is_reddit_media_domain': False,
 'is_meta': False,
 'category': None,
 'secure_media_embed': {},
 'link_flair_text': None,
 'can_mod_post': False,
 'score': 17,
 'approved_by': None

#### Using the following class, however, works just fine:

In [32]:
import requests
import pandas as pd


class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df

In [47]:
SUB = 'hawwkey'

In [48]:
CLIENT_ID = dict['client_id']
SECRET_TOKEN = dict['secret']

In [49]:
USER = dict['user']
PWD = dict['pwd']

In [50]:
reddit = Reddit(CLIENT_ID, SECRET_TOKEN, USER, PWD)

In [51]:
data = reddit.get_new(SUB, 20)

No more found


In [52]:
data = data.replace({'|': ''}, regex=True)

In [53]:
data.to_csv(f'./data/reddit_{SUB}.csv', sep='|', index=False)

In [54]:
data.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_yua5fs,1668365000.0,hawwkey,Sabres welcome fans from Roswell Park Comprehe...,,1.0,17.0,0.0,17.0
1,t3_ys0byl,1668138000.0,hawwkey,Julien Gauthiers dad does a little dance after...,,0.81,3.0,0.0,3.0
2,t3_yn8i0r,1667687000.0,hawwkey,Tarasenko absolutely leveled by 9 year old :),,0.88,95.0,0.0,95.0
3,t3_ymighg,1667617000.0,hawwkey,Local Finland Kids Participate in Avalanche-Bl...,,1.0,10.0,0.0,10.0
4,t3_ymhthh,1667616000.0,hawwkey,some of the hurricanes dancing along with the ...,,0.98,261.0,0.0,261.0
