In [1]:
## Import libaries
import pandas as pd
import numpy as np
import requests
import json
import time
import os
from tqdm import tqdm
from IPython.display import clear_output
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth', -1)

In [2]:
#### Function to pull a maximum of ~2,500 reddit posts and all associated comments
def reddit_pull(url, max_pull_size):
    
    ## Setting global variables so they can be used in later functions
    global headers
    global errors
    global comments_list
    
    ## Establishing login creds and empty lists
    errors = []
    posts_list = []
    comments_list = []
    
    ## Setting after to random string so that loop does not break
    after = 'randomstring'
    
    ## Continuing to loop until there are as many posts as the defined/passed max pull size
    while len(posts_list) < max_pull_size:
        print('Pulling post data...\n') ## Sign posting progress
        if after == None: ## Breaks loop when last post is reached
            break
            
        ## After indicates to reddit API where I want to start pulling after
        res = requests.get(url, params = {'after' : after}, headers={'User-agent' : 'jimtronic'})
        json_pull = res.json() ## Storing pull as JSON file
        print('Done pulling post data.\n') ## Sign posting progress
        posts_list.extend(json_pull['data']['children']) ## Adding ALL of the JSON pull to my dataset
        after = json_pull['data']['after'] ## Setting after to indicate where to start next json pull
        time.sleep(1) ## Pausing so API doesn't throttle me
        
        append_comms(json_pull) ## Calling function to pull and append data from each comment page
        
        ## Sign posting progress
        clear_output()
        print(f"Total Posts Pulled: {len(posts_list)}\nTotal Comments Pulled: {len(comments_list)}" +\
    f"\nComment Page Errors: {errors}\n")
    
    ## Converts complex list of dicts into simple list of dict for easy CSV export
    clean_posts_list = data_structure_posts(posts_list)
    
    ## Returning list of dicts for both posts and comments
    return clean_posts_list, comments_list ###### EDIT EDIT EDIT

###### Would like to add function to re-run errors ######

In [3]:
#### Function to pull comments for each reddit post
def append_comms(json_pull):
    print('Pulling comments data...')
    
    ## Create a list of all the posts from the recent/passed JSON pull
    json_list = json_pull['data']['children']
    for i, json_item in enumerate(json_list):
        print(f'Pulling comments for post... {i+1} of {len(json_list)}')
        
        ## Storing the post's comment page url as "com_url"
        com_url = 'https://www.reddit.com/' + json_item['data']['permalink'][:-1] + ".html"
        com_res = requests.get(com_url, headers={'User-agent' : 'jimtronic'}) ## Getting data on comment page
        com_soup = BeautifulSoup(com_res.content, 'html') ## Bringing data into Beautiful Soup
        
        ## Capturing the entire comments section
        comment_section = com_soup.find('div', {'class':'p0SYO8TbZVqJIWEeFcNZx i6gx00-2 hNwzqg'})
        
        ## Try to find all the individual comment blocks
        try:
            comment_section.find_all('div', {'class' : 'Comment'})
            
        ## If comment blocks not found (None type), there's an error with that page
        except:
            print('Page error: ' + str(com_url))
            errors.append(com_url)
            break ### ERRORS STORED -- not sure why they're happening... throttling?
        
        ## Iterating through each comment on the comments page
        for com in comment_section.find_all('div', {'class' : 'Comment'}):
            try: ## Try adding the user name and comment text
                user = com.find('div', {'class' : 'xvda30-0'}).text
                text = com.find('div', {'data-test-id': 'comment'}).text
            except: ## If it can't, it's b/c the comment was removed
                user = 'Removed_comment'
                text = 'Deleted comment'
            try: ## Try to find the comment score
                points = float(com.find('span', {'class' : 'h5svje-0 cFQOcm'}).text.strip(' points'))
            except: ## If it can't, it's b/c the score was hidden
                points = "Score hidden"
                
            ## Append the dictionary to the end of the comments list of dicts
            comments_list.append({
                'reply_to': com_url, ## So I know which post responded to
                'user': user,
                'text': text,
                'points': points,
                ## Indicates how many replies deep the reply was
                'level': float(com.find('span', {'class': 's1dqr9jy-0 imyGpC'}).text.strip('level '))})

###### Would like to add something to convert '6 hours ago' to a date-time feature... ######
###### Would like to use the API the whole time... why am I switching to web scraping? ######

In [4]:
#### Takes entire API pull of post data and returns organized structure of useful data
def data_structure_posts(posts_list):
    clean_posts_list = [] ## Where useful data will be stored
    for post in posts_list: ## Loops through a list of posts
        clean_posts_list.append({
            'likes' : post['data']['likes'],
            'up_votes' : post['data']['ups'],
            'down_votes' : post['data']['downs'],
            'title' : post['data']['title'],
            'text' : post['data']['selftext'],
            'author' : post['data']['author'],
            'num_crossposts' : post['data']['num_crossposts'],
            'is_crosspostable' : post['data']['is_crosspostable'],
            'num_comments' : post['data']['num_comments'],
            'score' : post['data']['score'],
            'permalink' : 'https://www.reddit.com/' + str(post['data']['permalink']), ## To reference comments 
            'name' : post['data']['name'],
            'url' : post['data']['url'],
            'media' : post['data']['media'],
            'num_reports' : post['data']['num_reports']
        })
    return clean_posts_list

In [5]:
#### Function to remove duplicates...

In [6]:
url_conservative = 'https://www.reddit.com/r/conservative.json'
posts_conservative, comments_conservative = reddit_pull(url_conservative, max_pull_size = 2000)

Total Posts Pulled: 844
Total Comments Pulled: 5062
Comment Page Errors: ['https://www.reddit.com//r/Conservative/comments/b7ww6s/please_post_articles_on_rconservativearticles_and.html', 'https://www.reddit.com//r/Conservative/comments/b7e5mb/university_of_california_uses_trumps_executive.html', 'https://www.reddit.com//r/Conservative/comments/b64h41/republicans_accused_of_colluding_with_reality_to.html']

Pulling post data...



In [7]:
url_liberal = 'https://www.reddit.com/r/liberal.json'
posts_liberal, comments_liberal = reddit_pull(url_liberal, max_pull_size = 2000)

Total Posts Pulled: 563
Total Comments Pulled: 4483
Comment Page Errors: ['https://www.reddit.com//r/Liberal/comments/aalzy7/maine_gop_governor_paul_lepage_writes_stolen.html']

Pulling post data...



In [8]:
df_lib_posts = pd.DataFrame(posts_liberal)
df_lib_comments = pd.DataFrame(comments_liberal)
df_cons_posts = pd.DataFrame(posts_conservative)
df_cons_comments = pd.DataFrame(comments_conservative)

In [9]:
df_lib_posts.to_csv('../data/liberal_posts.csv', index=False)
df_lib_comments.to_csv('../data/liberal_comments.csv', index=False)
df_cons_posts.to_csv('../data/conservative_posts.csv', index=False)
df_cons_comments.to_csv('../data/conservative_comments.csv', index=False)