In [1]:
# Harris and Walz Notebook
# Import libraries
import pandas as pd
import requests
import json


In [None]:


with open('params.json') as param_json:
    params = json.load(param_json)

CLIENT_ID = params['CLIENT_ID']
SECRET_TOKEN = params['SECRET_TOKEN']
USERNAME = params['USERNAME']
PASSWORD = params['PASSWORD']
KEYWORDS = params['KEYWORDS']
keywords = [word.lower() for word in KEYWORDS]

In [4]:

# note that CLIENT_ID refers to 'personal use script' and SECRET_TOKEN to 'token'
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_TOKEN)

# here we pass our login method (password), username, and password
data = {'grant_type': 'password',
        'username': USERNAME,
        'password': PASSWORD}

# setup our header info, which gives reddit a brief description of our app
headers = {'User-Agent': 'MyBot/0.0.1'}

# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

# convert response to JSON and pull access_token value
TOKEN = res.json()['access_token']

# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)


<Response [200]>

In [13]:

def get_post(subreddit, cats, concated_df):
    '''
    Gets post from passed subreddits. 
    params: 
        subreddit - the subreddit to pull post from 
        cats - 'hot' or 'rising' which are filters on subreddits to find trending post 
        concated_df - dataframe from yesterday which contains post from previous day. 
    '''

    res = requests.get(f"https://oauth.reddit.com/r/{subreddit}/{cats}",
                       headers=headers)
    data = res.json()
    #print(data)
    posts = data['data']['children']
    posts_data = []

    for post in posts:
        post_info = post['data']
        posts_data.append({
            'title': post_info['title'],
            'upvote_ratio': post_info['upvote_ratio'],
            'subreddit_name_prefixed': post_info['subreddit_name_prefixed'],
            'date': post_info['created_utc']

        })

        # Create a DataFrame
    df = pd.DataFrame(posts_data)

    df['date'] = pd.to_datetime(df['date'], unit='s').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
    df['title'] = df['title'].apply(lambda x: x.lower())
    df = df[df['title'].apply(lambda word: any(keyword in word for keyword in keywords))]

    newDF = pd.concat([df, concated_df], ignore_index=True)

    return newDF


In [14]:

def subreddits():
    '''
    Runs the "get_post" function on all of our subreddits. Outputs a new csv called 'titles'. 
    Titles contains: 
        - post title 
        - subreddit name
        - ratio of upvotes to downvotes
        - date published 
    :return: 
    '''
    with open('output/titles.csv') as maindf:
        main_df = pd.read_csv(maindf)

    df = get_post("politics", "hot", main_df)
    df = get_post("democrats", "rising", df)
    df = get_post("politicaldiscussion", "rising", df)
    df = get_post("politicaldiscussion", "hot", df)
    df = get_post("moderatepolitics", "rising", df)
    df = get_post("moderatepolitics", "hot", df)
    df = get_post("democrats", "hot", df)
    df = get_post("politics", "rising", df)
    df = get_post("politics", "hot", df)
    df = df[['title', 'upvote_ratio', 'subreddit_name_prefixed', 'date']]
    df = df.drop_duplicates()
    df.to_csv('output/titles.csv')


In [15]:
subreddits()

In [16]:
df = pd.read_csv('output/titles.csv')
print(len(df))
df = df.drop_duplicates()
print(len(df))
df.to_csv('output/titles.csv')

167
123
