## 1) Importing Libraries

In [1]:
## Import libraries
import pandas as pd
import numpy as np
import requests
import json
import time
import os
from tqdm import tqdm
from IPython.display import clear_output
from bs4 import BeautifulSoup
## Show full cell contents when displaying DataFrames (None = no truncation; -1 is rejected by newer pandas)
pd.set_option('display.max_colwidth', None)

## 2) Pulling Reddit Data
Data for posts is pulled through the Reddit API. For each post, the permalink is used to do an additional webscrape of each post's comments page. All comments are appended as a single comment block in a column of the final dataframe.

In [2]:
#### Function to pull a maximum of ~2,500 reddit posts and all associated comments
def reddit_pull(url, max_pull_size):
    """Pull posts from a subreddit JSON listing, then scrape each post's comments.

    Pages through ``url`` using the listing's ``after`` cursor until
    ``max_pull_size`` posts have been collected or the listing runs out, then
    calls ``append_comms`` on every post to attach its comment text.

    Parameters
    ----------
    url : str
        Listing endpoint, e.g. 'https://www.reddit.com/r/<subreddit>.json'.
    max_pull_size : int
        Maximum number of posts to return.

    Returns
    -------
    list of dict
        One cleaned dict per post (as built by ``data_structure_posts``), with
        the ``comments_text`` and ``num_deleted_comments`` keys added by
        ``append_comms``.

    Raises
    ------
    requests.HTTPError
        If a listing request comes back with an error status.

    Notes
    -----
    Resets the module-level ``errors`` list; ``append_comms`` fills it with the
    URLs of comment pages that could not be parsed.
    """
    ## Global so append_comms can record failures and the user can inspect them afterwards
    global errors

    errors = []
    posts_list = []

    ## No cursor on the first request; requests leaves out params whose value is None
    after = None

    print(f'Pulling post data for up to {max_pull_size} posts...\n') ## Sign posting progress

    ## Continuing to loop until there are as many posts as the defined/passed max pull size
    while len(posts_list) < max_pull_size:

        ## After indicates to reddit API where I want to start pulling after
        res = requests.get(url, params = {'after' : after}, headers={'User-agent' : 'jimtronic'})
        res.raise_for_status() ## Fail loudly on throttling/HTTP errors instead of on a confusing JSON/Key error
        json_pull = res.json()

        ## Add cleaned version of each of the pulled posts to the post list
        children = json_pull['data']['children']
        posts_list.extend(data_structure_posts(children))

        after = json_pull['data']['after'] ## Setting 'after' to indicate where to start next json pull
        time.sleep(1) ## Pausing so API doesn't throttle us
        print(f"Total Posts Pulled: {len(posts_list)}")

        ## Stop at the last page, or if a page came back empty (avoids looping forever)
        if after is None or not children:
            break

    ## A full page can overshoot the requested size; keep the promise of "up to max_pull_size"
    posts_list = posts_list[:max_pull_size]

    clear_output()

    print(f'Pulling comments data for {len(posts_list)} posts...') ## User sign posting
    for i, post in enumerate(posts_list):
        if i % 50 == 0:
            print(f'Pulling comments data for post {i+1}-{min(i+50,len(posts_list))} of {len(posts_list)}')
        append_comms(post) ## Mutates the post dict in place, so the return value is not needed

    clear_output()

    print(f"Total Posts Pulled: {len(posts_list)}\n" + f"\nComment Page Errors: {errors}")

    ## Returning list of post dicts (each now carrying its comments)
    return posts_list

################################
## MAY ADD LATER
## Function to remove duplicates...
## Function to retry errors
################################

In [3]:
#### Function to append comments as a single text block associated with a post
def append_comms(post):
    """Scrape a post's comment page and attach the comment text to ``post``.

    Fetches ``post['permalink']`` (trailing character dropped, '.html' added),
    joins the text of every comment not written by AutoModerator into a single
    string, and stores it on the post together with a count of comments whose
    author or body could not be found (treated as removed/deleted).

    Parameters
    ----------
    post : dict
        Post dict with at least a ``'permalink'`` key. Mutated in place: the
        keys ``'comments_text'`` and ``'num_deleted_comments'`` are set.

    Returns
    -------
    dict or str
        The same ``post`` dict on success. If the comments container is not
        found on the page, the URL is appended to the module-level ``errors``
        list, the two keys are set to ``''`` and ``0``, and ``''`` is returned.

    Notes
    -----
    Relies on the module-level ``errors`` list that ``reddit_pull`` creates.
    The CSS class names below are presumably Reddit's generated class names at
    the time of writing -- TODO confirm they still match the live markup.
    """
    comments_list = []
    num_deleted_comments = 0 ## Keeps track of deleted comments

    com_url = post['permalink'][:-1] + '.html' ## Storing the post's comment page url as "com_url"
    com_res = requests.get(com_url, headers={'User-agent' : 'jimtronic'}) ## Getting data on comment page
    com_soup = BeautifulSoup(com_res.content, 'html') ## Bringing data into Beautiful Soup

    ## Capturing the entire comments section
    comment_section = com_soup.find('div', {'class':'p0SYO8TbZVqJIWEeFcNZx'})

    ## No comments container means the page did not load/parse as expected
    if comment_section is None:
        print('Page error: ' + str(com_url)) ## Let user know
        errors.append(com_url) ## Append that broken URL so user can see it
        post['comments_text'] = '' ## Append blank comment text
        post['num_deleted_comments'] = 0 ## 0 Comments deleted from post
        return ''

    ## Iterating through each comment on the comments page
    for com in comment_section.find_all('div', {'class' : 'Comment'}):

        ## A comment without an author element is treated as removed
        author_div = com.find('div', {'class' : 'xvda30-0 camSYk'})
        if author_div is None:
            num_deleted_comments += 1
            continue

        ## Removing auto-mod comments
        if author_div.text == 'AutoModerator':
            continue

        ## A comment without a body element is treated as removed as well
        body_div = com.find('div', {'data-test-id': 'comment'})
        if body_div is None:
            num_deleted_comments += 1
            continue

        ## Commas are stripped so the text stays friendly to CSV export
        comments_list.append(body_div.text.replace(',',''))

    post['comments_text'] = ' '.join(comments_list) ## Joins each comment into one string block
    post['num_deleted_comments'] = num_deleted_comments ## Sets deleted comments in specific post
    return post

################################
## MAY ADD LATER
## Use the API instead of switching to web scraping?...
################################

In [4]:
#### Takes list of post data and returns organized structure of useful data
def data_structure_posts(posts_list):
    """Reduce raw listing entries to flat dicts holding only the useful fields.

    Each entry is expected to carry its fields under a ``'data'`` key. Commas
    are stripped from the title and text, and the permalink is turned into an
    absolute https://www.reddit.com URL.
    """
    def _clean(raw_post):
        ## Everything of interest sits under the entry's 'data' key
        fields = raw_post['data']
        return {
            'up_votes' : fields['ups'],
            'down_votes' : fields['downs'],
            'title' : fields['title'].replace(',',''),
            'text' : fields['selftext'].replace(',',''),
            'author' : fields['author'],
            'num_crossposts' : fields['num_crossposts'],
            'is_crosspostable' : fields['is_crosspostable'],
            'num_comments' : fields['num_comments'],
            'score' : fields['score'],
            'permalink' : 'https://www.reddit.com' + str(fields['permalink']),
            'name' : fields['name'],
            'url' : fields['url'],
        }

    return [_clean(raw_post) for raw_post in posts_list]

In [5]:
## Pull up to 2000 posts, with their comment text, from the r/conservative listing
url_conservative = 'https://www.reddit.com/r/conservative.json'
posts_conservative = reddit_pull(url=url_conservative, max_pull_size=2000)

Total Posts Pulled: 826

Comment Page Errors: ['https://www.reddit.com/r/Conservative/comments/b7k4r7/us_struggling_with_growing_number_of_asylum.html', 'https://www.reddit.com/r/Conservative/comments/b7hwtq/williams_more_university_corruption.html']


In [6]:
## Pull up to 2000 posts, with their comment text, from the r/liberal listing
url_liberal = 'https://www.reddit.com/r/liberal.json'
posts_liberal = reddit_pull(url=url_liberal, max_pull_size=2000)

Total Posts Pulled: 568

Comment Page Errors: []


In [7]:
## One DataFrame per subreddit; every post dict becomes a row
df_cons_posts, df_lib_posts = (
    pd.DataFrame(posts) for posts in (posts_conservative, posts_liberal)
)

## 3) Exporting the Data
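Pickle (`.pkl`) files are the preferred export. CSV files are written as well, but some comment blocks exceed 32,767 characters – the per-cell limit in spreadsheet tools such as Excel – so that text can be truncated when the CSV is opened there.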

In [8]:
## Create the output folder first so a missing directory doesn't fail the export after a long scrape
os.makedirs('../data', exist_ok=True)
df_lib_posts.to_csv('../data/liberal_posts_comms.csv', index=False)
df_cons_posts.to_csv('../data/conservative_posts_comms.csv', index=False)

In [9]:
## Saving to pkl file as well: some comment blocks are longer than 32,767 characters,
## which spreadsheet tools can truncate when the CSV export is opened
os.makedirs('../data', exist_ok=True) ## Make sure the output folder exists before writing
df_lib_posts.to_pickle('../data/liberal_posts_comms.pkl')
df_cons_posts.to_pickle('../data/conservative_posts_comms.pkl')

## 4) Notes & Considerations
- DataFrames may need to be de-duped
- Relying solely on the API, rather than doing a webscrape, may be the preferred way of eliminating errors
- If not, adding a function to retry errors may work