In [None]:
# NOTE: Never commit info.txt to GitHub.
# The CSV files are too big to upload to GitHub; share them through Google Drive and commit only the .ipynb notebook.

import pandas as pd
import schedule
import time
import numpy as np
import requests
import datetime
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

In [None]:
# Read the Reddit API credentials. info.txt is read as four lines:
# client id, client secret, username, password.
with open('info.txt', 'r') as f:
    info = f.readlines()

# Public key
client_id = info[0].rstrip('\n')

# Private key
secret_key = info[1].rstrip('\n')

auth = requests.auth.HTTPBasicAuth(client_id, secret_key)

data = {
    'grant_type': 'password',
    'username': info[2].rstrip('\n'),
    # Strip the line break here as well: if info.txt ends with a newline the
    # raw line would otherwise be sent as "<password>\n".
    'password': info[3].rstrip('\n')}

headers = {'User-Agent': 'MyAPI/0.0.1'}

# The file is already closed at this point; only the parsed values are needed
# for the token request.
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)
# Surface an HTTP error status directly instead of failing on the
# 'access_token' lookup below.
res.raise_for_status()

token = res.json()['access_token']

# Every later API call reuses these headers (User-Agent + bearer token).
headers = {**headers, 'Authorization': f'bearer {token}'}

In [None]:
def fetch_reddit_data(subreddit, post_or_comment, csv_file):
    """Fetch one Reddit listing and accumulate its rows in a CSV file.

    Requests up to 100 items from
    ``https://oauth.reddit.com/r/<subreddit>/<post_or_comment>`` using the
    module-level ``headers``, flattens every item into one row and stores the
    rows in ``csv_file``. If the file already exists the new rows are appended
    and exact duplicate rows are dropped; otherwise the file is created with a
    header row.

    Parameters
    ----------
    subreddit : str
        Subreddit name, inserted after ``/r/`` in the URL.
    post_or_comment : str
        Listing to fetch: ``'new'`` or ``'hot'`` produce post rows,
        ``'comments'`` produces comment rows. Any other value produces no
        rows and leaves ``csv_file`` untouched.
    csv_file : str or path-like
        CSV file to create or extend.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    link = 'https://oauth.reddit.com/r/' + subreddit + '/' + post_or_comment
    res = requests.get(link, headers=headers, params={'limit': '100'})
    # Fail with the HTTP error itself rather than with a KeyError on the
    # missing 'data' key further down.
    res.raise_for_status()

    children = res.json()['data']['children']

    # Choose the row layout once instead of re-testing it for every item.
    if post_or_comment in ('new', 'hot'):
        build_row = _post_row
    elif post_or_comment == 'comments':
        build_row = _comment_row
    else:
        build_row = None

    records = [build_row(child['data']) for child in children] if build_row else []
    if not records:
        # Writing an empty frame would create a CSV without a header row, and
        # every later header-less append would then be misread.
        return

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    newest_df = pd.DataFrame(records)

    path = Path(csv_file)
    if path.is_file():
        # Append first and de-duplicate after re-reading, so old and new rows
        # go through the same CSV parsing before they are compared.
        newest_df.to_csv(path, mode='a', index=False, header=False)
        df = pd.read_csv(path).drop_duplicates()
        df.to_csv(path, index=False)
    else:
        newest_df.to_csv(path, index=False)


def _post_row(item):
    """Flatten the 'data' dict of one post into a CSV row."""
    return {
        'subreddit': item['subreddit'],
        'title': item['title'],
        'selftext': item['selftext'],
        'upvote_ratio': item['upvote_ratio'],
        'ups': item['ups'],
        'downs': item['downs'],
        'score': item['score'],
        'author': item['author'],
        # NOTE: fromtimestamp() returns naive local time, so despite the key
        # name this value is not UTC. Left unchanged so new rows stay
        # comparable with rows already stored on disk.
        'created_utc': datetime.datetime.fromtimestamp(item['created_utc']),
    }


def _comment_row(item):
    """Flatten the 'data' dict of one comment into a CSV row."""
    return {
        'subreddit': item['subreddit'],
        'post_title': item['link_title'],
        'ups': item['ups'],
        'downs': item['downs'],
        'score': item['score'],
        'post_author': item['link_author'],
        # Naive local time, same caveat as for posts.
        'date': datetime.datetime.fromtimestamp(item['created_utc']),
        'num_comments': item['num_comments'],
        'comment_author': item['author'],
        'comment_text': item['body'],
    }
    

In [None]:
def get_all_data():
    """Fetch the 'new', 'hot' and 'comments' listings of every tracked subreddit.

    Each (subreddit, listing, CSV file) entry of the table below is passed to
    fetch_reddit_data, in table order.
    """
    fetch_plan = (
        # Antifeminism subreddit
        ('Antifeminists', 'new', 'antifeminism_posts.csv'),
        ('Antifeminists', 'hot', 'antifeminism_hot.csv'),
        ('Antifeminists', 'comments', 'antifeminism_comments.csv'),
        # Askfeminism subreddit
        ('Askfeminists', 'new', 'askfeminism_posts.csv'),
        ('Askfeminists', 'hot', 'askfeminism_hot.csv'),
        ('Askfeminists', 'comments', 'askfeminism_comments.csv'),
        # Feminism subreddit
        ('Feminism', 'new', 'feminism_posts.csv'),
        ('Feminism', 'hot', 'feminism_hot.csv'),
        ('Feminism', 'comments', 'feminism_comments.csv'),
    )

    for subreddit_name, listing, target_csv in fetch_plan:
        fetch_reddit_data(subreddit_name, listing, target_csv)

In [None]:
def merge_posts_comments(post_file, comments_file, csv_file):
    """Join a posts CSV with a comments CSV and write the combined table.

    Posts and comments are outer-joined on (subreddit, post title, post
    author), so posts without comments and comments without a stored post are
    both kept. The ``_merge`` column says which side(s) each row came from
    ('left_only' = post only, 'right_only' = comment only, 'both').

    Parameters
    ----------
    post_file : str or path-like
        CSV with the post columns (subreddit, title, selftext, upvote_ratio,
        ups, downs, score, author, created_utc).
    comments_file : str or path-like
        CSV with the comment columns (subreddit, post_title, ups, downs,
        score, post_author, date, num_comments, comment_author, comment_text).
    csv_file : str or path-like
        Destination CSV; overwritten if it exists.
    """
    posts_df = pd.read_csv(post_file)
    comments_df = pd.read_csv(comments_file)

    merged_df = pd.merge(
        posts_df,
        comments_df,
        how="outer",
        left_on=['subreddit', 'title', 'author'],
        right_on=['subreddit', 'post_title', 'post_author'],
        indicator=True
    )

    # The join keys have different names on each side, so pandas keeps both
    # copies: for a post without comments only 'title'/'author' are filled and
    # 'post_title'/'post_author' are NaN. Coalesce them before the left-side
    # copies are dropped, otherwise those posts lose their title and author.
    merged_df['post_title'] = merged_df['post_title'].combine_first(merged_df['title'])
    merged_df['post_author'] = merged_df['post_author'].combine_first(merged_df['author'])

    # '_x' suffixes come from the posts frame, '_y' from the comments frame.
    merged_df = merged_df.rename(columns={'selftext': 'post_selftext',
                                          'upvote_ratio': 'post_upvote_ratio',
                                          'ups_x': 'post_ups',
                                          'downs_x': 'post_downs',
                                          'score_x': 'post_score',
                                          'created_utc': 'post_date',
                                          'ups_y': 'comment_ups',
                                          'downs_y': 'comment_downs',
                                          'score_y': 'comment_score',
                                          'date': 'comment_date',
                                          'num_comments': 'post_num_comments'}
                                 ).drop(['title', 'author'], axis=1)

    # Post columns first, then comment columns, then the merge indicator.
    reorganized_df = merged_df[['subreddit',
                                'post_title',
                                'post_selftext',
                                'post_author',
                                'post_date',
                                'post_num_comments',
                                'post_upvote_ratio',
                                'post_ups',
                                'post_downs',
                                'post_score',
                                'comment_text',
                                'comment_author',
                                'comment_date',
                                'comment_ups',
                                'comment_downs',
                                'comment_score',
                                '_merge']]

    reorganized_df.to_csv(csv_file, index=False)



In [None]:
def merging():
    """Build the six merged post/comment CSVs.

    For each subreddit both post listings (new posts and hot posts) are joined
    with that subreddit's comments file through merge_posts_comments.
    """
    merge_plan = [
        ('antifeminism_posts.csv', 'antifeminism_comments.csv', 'antifeminism_posts_comments.csv'),
        ('antifeminism_hot.csv', 'antifeminism_comments.csv', 'antifeminism_hot_comments.csv'),
        ('askfeminism_posts.csv', 'askfeminism_comments.csv', 'askfeminism_posts_comments.csv'),
        ('askfeminism_hot.csv', 'askfeminism_comments.csv', 'askfeminism_hot_comments.csv'),
        ('feminism_posts.csv', 'feminism_comments.csv', 'feminism_posts_comments.csv'),
        ('feminism_hot.csv', 'feminism_comments.csv', 'feminism_hot_comments.csv'),
    ]

    for file_triple in merge_plan:
        merge_posts_comments(*file_triple)


In [None]:
# Start from an empty scheduler: the default scheduler lives as long as the
# kernel, so re-running this cell would otherwise register the same jobs again
# and every fetch/merge would run multiple times.
schedule.clear()

# Keep the Job handles. schedule.cancel_job() expects the Job returned by
# .do(), not the function that was scheduled.
fetch_job = schedule.every(1).hours.do(get_all_data)
merge_job = schedule.every(1).days.do(merging)

# Runs until the cell is interrupted (Kernel -> Interrupt, or Ctrl-C).
try:
    while True:
        schedule.run_pending()
        time.sleep(5)
except KeyboardInterrupt:
    # Interrupting the cell is the intended way to stop the schedule.
    pass
finally:
    # Unregister both jobs so nothing is left pending after the loop ends.
    schedule.cancel_job(fetch_job)
    schedule.cancel_job(merge_job)

## TODO: Create a function that takes in a csv_file and runs NLP analysis on post_title, post_selftext, and comment_text. Run that function on all 6 CSV files whose names end in either hot_comments or posts_comments.