# Compiling Reddit API Data

In [1]:
# Imports
import json
import os
from getpass import getpass

import pandas as pd
import requests
from bs4 import BeautifulSoup


## Getting Access to Reddit API

We will be collecting all of our data from the Reddit API. Prior to this step, an application must be created under your account at [Reddit.com](https://www.reddit.com).

In [182]:
# Credentials are read from environment variables, falling back to an
# interactive prompt, so that no secret is ever stored in the notebook itself.
# SECURITY: the client id/secret that used to be hardcoded in this cell must be
# treated as compromised and regenerated in the Reddit app settings.
client_id = os.environ.get('REDDIT_CLIENT_ID') or getpass('Reddit client id: ')  # Provided under "personal use script"
client_secret = os.environ.get('REDDIT_CLIENT_SECRET') or getpass('Reddit client secret: ')  # Provided as "secret"
user_agent = 'GA Project' # Name of your application
username = os.environ.get('REDDIT_USERNAME') or input('Reddit username: ')  # Your reddit username
password = os.environ.get('REDDIT_PASSWORD') or getpass('Reddit password: ')  # Your reddit password

In [183]:
# HTTP Basic auth built from the application's id and secret
auth = requests.auth.HTTPBasicAuth(client_id, client_secret)

# Form fields for the token request: a 'password' grant for the given account
data = {
    'grant_type': 'password',
    'username': username,
    'password': password
}

# Identify the application with the name configured in `user_agent` rather than
# a leftover placeholder string, so the variable defined above is actually used.
headers = {'User-Agent': user_agent}

We then need to create a request for an access token.

In [None]:
# Ask Reddit for an access token.
# `timeout` keeps this cell from hanging forever if the server never answers
# (requests applies no timeout unless one is given).
res = requests.post(
    'https://www.reddit.com/api/v1/access_token',
    auth=auth,
    data=data,
    headers=headers,
    timeout=30)

If our code is working, we should get a response code of 200.

In [None]:
# Stop here with a descriptive HTTPError if the token request was rejected,
# instead of continuing and failing later on the access-token lookup.
res.raise_for_status()
print(res)

In [185]:
# Retrieve the access token from the JSON body of the token response.
token_payload = res.json()
if 'access_token' not in token_payload:
    # The body can lack a token (presumably an 'error' field is returned for
    # rejected credentials - confirm against the Reddit docs); report that
    # explicitly rather than surfacing a bare KeyError.
    raise RuntimeError(
        "No access token in Reddit's response "
        f"(error: {token_payload.get('error', 'unknown')})"
    )
token = token_payload['access_token']

In [186]:
# Attach the bearer token so every later request is sent with it
headers.update({'Authorization': f'bearer {token}'})

# Evaluates to True when the authenticated call to the identity endpoint returns HTTP 200
me_response = requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)
me_response.status_code == 200

True

## r/GenZ Data

For the first half of our data we will be accessing [r/GenZ](https://www.reddit.com/r/GenZ/).

In [187]:
# Subreddit to collect from, and the URL prefix it is appended to
subreddit = 'genz'
base_url = 'https://oauth.reddit.com/r/'

We can only access 100 posts at a time.

In [188]:
# Query-string parameters: request 100 posts per call
params = dict(limit=100)

We need a new request; this time it will be for the data of the posts made in the r/GenZ subreddit.

In [189]:
# Request the subreddit listing using the authorized headers
listing_url = base_url + subreddit
res = requests.get(listing_url, headers=headers, params=params)

In [None]:
# Check that we can access the data we need for our loop:
# the pagination cursor, then three fields of the first post.
listing = res.json()['data']
print(listing['after'])
for field in ('title', 'selftext', 'subreddit_name_prefixed'):
    print('='*50)
    print(listing['children'][0]['data'][field])

In [252]:
params = {
    'limit': 100
}

genz_posts = [] # list where we will add a dictionary for each post

# Can only access the last 1000 posts, so stop requesting further pages once
# more than 950 posts have been collected.
while len(genz_posts) <= 950:
    res = requests.get(base_url+subreddit,
                       headers=headers,
                       params=params,
                       timeout=30) # do not hang forever on an unresponsive server
    res.raise_for_status() # surface HTTP errors here instead of a KeyError below

    listing = res.json()['data'] # parse the JSON body once per page, not once per field

    for child in listing['children']:
        post = child['data']
        # one dictionary per post, keeping only the fields we need
        genz_posts.append({
            'title': post['title'],
            'selftext': post['selftext'],
            'subreddit': post['subreddit_name_prefixed']
        })
    print(len(genz_posts))

    # A missing cursor means there is no further page. requests drops
    # None-valued params, so carrying on would re-fetch the first page and
    # append duplicates (or loop forever on an empty listing).
    if listing['after'] is None:
        break
    params['after'] = listing['after'] # cursor pointing at the next page

We can turn this into a DataFrame so it is easier to work with.

In [254]:
# Build one row per collected post dictionary
genz_posts_df = pd.DataFrame(data=genz_posts)

In [255]:
# Number of distinct titles (missing values, if any, count as one value)
genz_posts_df['title'].nunique(dropna=False)

956

In [300]:
# Preview the first five r/GenZ rows
genz_posts_df.head(n=5)

Unnamed: 0,title,selftext,subreddit
0,What have you learned this week?,,r/GenZ
1,r/GenZ Is in need of more moderators! Read below:,"In the past few months, we've been getting inc...",r/GenZ
2,Who else remembers Net Neutrality and when thi...,,r/GenZ
3,This sub lately.,,r/GenZ
4,You guys are absolutely pathetic,Where do I even start? It's just constant comp...,r/GenZ


## r/Millennials Data

The second half of our data comes from [r/Millennials](https://www.reddit.com/r/Millennials/). We'll follow the same steps as above to retrieve this data.

In [285]:
# Switch the target subreddit; the URL prefix is the same as before
subreddit = 'millennials'
base_url = 'https://oauth.reddit.com/r/'

In [286]:
# Fresh query parameters (no pagination cursor): 100 posts per call
params = dict(limit=100)

In [287]:
# Request the r/Millennials listing using the authorized headers
listing_url = base_url + subreddit
res = requests.get(listing_url, headers=headers, params=params)

In [288]:
# Check that we can access the data we need for our loop:
# the pagination cursor, then three fields of the first post.
listing = res.json()['data']
print(listing['after'])
for field in ('title', 'selftext', 'subreddit_name_prefixed'):
    print('='*50)
    print(listing['children'][0]['data'][field])

t3_18xqlqu
r/Millennials Weekly Rant/Politics Thread
Please use this weekly thread to vent and let loose about personal rants. Got something upsetting or overwhelming that you just need to vent or shout out to the world? You can post those thoughts here. There are many real problems that plague the Millennial generation and we want to allow a space for it here while still keeping the angry and divisive posts to a more concentrated thread rather than taking up the entire front page.

Also while we generally remove political posts, we do allow general discussions of politics here so long as you remain civil and don't attack someone just for having a different opinion. The moment we see things start to derail, we will step in.
r/Millennials


In [289]:
params = {
    'limit': 100
}

mill_posts = [] # list where we will add a dictionary for each post

# Can only access the last 1000 posts, so stop requesting further pages once
# more than 950 posts have been collected.
while len(mill_posts) <= 950:
    res = requests.get(base_url+subreddit,
                       headers=headers,
                       params=params,
                       timeout=30) # do not hang forever on an unresponsive server
    res.raise_for_status() # surface HTTP errors here instead of a KeyError below

    listing = res.json()['data'] # parse the JSON body once per page, not once per field

    for child in listing['children']:
        post = child['data']
        # one dictionary per post, keeping only the fields we need
        mill_posts.append({
            'title': post['title'],
            'selftext': post['selftext'],
            'subreddit': post['subreddit_name_prefixed']
        })
    print(len(mill_posts))

    # A missing cursor means there is no further page. requests drops
    # None-valued params, so carrying on would re-fetch the first page and
    # append duplicates (or loop forever on an empty listing).
    if listing['after'] is None:
        break
    params['after'] = listing['after'] # cursor pointing at the next page

102
202
302
402
502
602
702
802
902
961


Now that we have the data from r/Millennials, we need to turn it into a dataframe so we can combine it with r/GenZ and store it.

In [290]:
# Build one row per collected post dictionary
mill_posts_df = pd.DataFrame(data=mill_posts)

In [291]:
# Number of distinct titles (missing values, if any, count as one value)
mill_posts_df['title'].nunique(dropna=False)

960

In [293]:
# Preview the first five r/Millennials rows
mill_posts_df.head(n=5)

Unnamed: 0,title,selftext,subreddit
0,r/Millennials Weekly Rant/Politics Thread,Please use this weekly thread to vent and let ...,r/Millennials
1,"For those who have free time, We are currently...",,r/Millennials
2,Amy Lee of the band Evanescence going to schoo...,,r/Millennials
3,I can't stress this enough...talk to your pare...,[https://www.wbur.org/hereandnow/2024/01/04/am...,r/Millennials
4,New Middle Age For Males in US is 36 Years Old...,Something on the Internet made me look this up...,r/Millennials


## Combining into one DataFrame and Exporting

### Combine into one DataFrame

In [294]:
# Stack the two frames row-wise. `ignore_index=True` renumbers the rows 0..n-1;
# without it both inputs keep their own labels, so labels repeat across the
# two halves and label-based lookups such as `all_posts.loc[0]` return two rows.
all_posts = pd.concat([genz_posts_df, mill_posts_df], axis=0, ignore_index=True)

In [295]:
# The r/GenZ frame was concatenated first, so its rows should lead the combined frame
all_posts.head(n=5)

Unnamed: 0,title,selftext,subreddit
0,What have you learned this week?,,r/GenZ
1,r/GenZ Is in need of more moderators! Read below:,"In the past few months, we've been getting inc...",r/GenZ
2,Who else remembers Net Neutrality and when thi...,,r/GenZ
3,This sub lately.,,r/GenZ
4,You guys are absolutely pathetic,Where do I even start? It's just constant comp...,r/GenZ


In [None]:
# The r/Millennials frame was concatenated last, so its rows should close the combined frame
all_posts.tail(n=5)

In [299]:
# rows in all_posts should equal rows in gen z df plus rows in millennials df
print(genz_posts_df.shape)
print(mill_posts_df.shape)
print(all_posts.shape)

# Enforce the invariant instead of relying on a visual comparison of the output
assert len(all_posts) == len(genz_posts_df) + len(mill_posts_df), \
    'combined frame is missing or duplicating rows'

(965, 3)
(961, 3)
(1926, 3)


### Export to .csv

Let's store this as a .csv file. That way we can read it into other notebooks for our models.

In [301]:
# Write the combined posts to disk without the index column
all_posts.to_csv(
    path_or_buf='../data/combined_genz_mill_posts.csv',
    index=False,
)

Just to be safe, let's read our data back in to make sure it stored properly.

In [2]:
# Load the exported file back in to verify the round trip.
# NOTE(review): read_csv parses empty fields as NaN by default, so posts with an
# empty selftext presumably come back as NaN - confirm downstream handling.
test = pd.read_csv(filepath_or_buffer='../data/combined_genz_mill_posts.csv')

In [3]:
# Show the dimensions of the reloaded frame, then preview its first rows
rows_and_cols = test.shape
print(rows_and_cols)
test.head(n=5)

(1926, 3)


Unnamed: 0,title,selftext,subreddit
0,What have you learned this week?,,r/GenZ
1,r/GenZ Is in need of more moderators! Read below:,"In the past few months, we've been getting inc...",r/GenZ
2,Who else remembers Net Neutrality and when thi...,,r/GenZ
3,This sub lately.,,r/GenZ
4,You guys are absolutely pathetic,Where do I even start? It's just constant comp...,r/GenZ


In [4]:
# Preview the last five rows of the reloaded frame
test.tail(n=5)

Unnamed: 0,title,selftext,subreddit
1921,🏴‍☠️ Anyone miss the golden age of internet pi...,,r/Millennials
1922,Who else used to call time and temp as childre...,I remember my siblings and I would call it non...,r/Millennials
1923,Millennial Aunts and Uncles: How many of you f...,I’m a 43 year old elder Millennial and proud u...,r/Millennials
1924,All the nostalgia music for our generation is ...,I kind of understand the when we were young th...,r/Millennials
1925,“Our Employees are Family” Meme. First-hand ex...,[Article the meme is from](https://open.substa...,r/Millennials
