# Scraping r/vegan and r/carnivore - Kyle Ness

### In this file, we gather Reddit posts from r/vegan and r/carnivore via the Pushshift API (https://github.com/pushshift/api). These posts will later be used to fit and evaluate two classifiers using NLP methods.

In [77]:
#Imports
import requests as req
import pandas as pd
import numpy as np
import time

In [100]:
#Two functions are defined here: a helper that requests posts 100 at a time from a specified subreddit until it has at least 100 valid ones,
#and a higher-level function that calls the helper 10 times. End result: roughly 1,000 - 1,100 posts gathered from Reddit per call.

def pull_data(subreddit, before): #designed as a helper function to 'build' below
    """Collect at least 100 posts that have a usable description from one subreddit.

    The pushshift submission-search endpoint is queried 100 submissions at a
    time, walking backwards in time from ``before``. Submissions whose
    ``selftext`` is missing, empty or ``'[removed]'`` are discarded. Requests
    continue until 100 valid posts have been gathered or the API returns no
    older submissions.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search (sent as the ``subreddit`` parameter).
    before : int
        Epoch timestamp; sent as the ``before`` parameter of the first request.

    Returns
    -------
    tuple
        ``(posts, before)`` where ``posts`` is a DataFrame with the columns
        ``subreddit``, ``title`` and ``selftext``, and ``before`` is the
        smallest ``created_utc`` seen, to be passed to the next call.
    """
    print('Pull start') #Status check, along with the other print statements below.
    url = 'https://api.pushshift.io/reddit/search/submission'
    keep_cols = ['subreddit', 'title', 'selftext']
    scrape_list = []
    post_count = 0

    #A while loop is used because it is unknown how many valid posts (that is, ones with descriptions) each request returns.
    #A request size of 100 is used because larger sizes (1000, even 250) resulted in a 'Too many requests' error.
    while post_count < 100:
        #Issue ONE request per iteration and reuse the response, rather than one request to check the status and a second
        #(unchecked) one to fetch the data - this halves the load on the API.
        response = req.get(url, {'subreddit': subreddit, 'size': 100, 'before': before})
        if response.status_code != 200:
            print('Please standby')
            time.sleep(5)
            continue

        posts_df = pd.DataFrame(response.json()['data'])
        if posts_df.empty:
            #Nothing older than 'before' is left; stop instead of failing on the missing 'selftext' column.
            print('No more posts available')
            break

        #Drop all entries without a post description
        selftext = posts_df['selftext']
        has_description = selftext.notna() & (selftext.str.len() != 0) & (selftext != '[removed]')
        posts_w_descripts = posts_df[has_description]

        #Append this dataframe to the list of all dataframes created via scraping
        scrape_list.append(posts_w_descripts[keep_cols])
        print(f'Scrape {len(scrape_list)}')

        #Track how many valid entries have been gathered so far.
        post_count += posts_w_descripts.shape[0]
        #Move the cutoff to the oldest post in this batch so the next request goes further back and does not repeat data.
        before = posts_df['created_utc'].min()

    if not scrape_list:
        #pd.concat raises on an empty list, so hand back an empty frame with the expected columns.
        return (pd.DataFrame(columns = keep_cols), before)

    #return a dataframe of the 100 or so valid posts collected, plus the cutoff for the next pull
    return (pd.concat(scrape_list, axis = 0), before)

def build(subreddit, before = 1656400000, n_pulls = 10, pause = 10): #builds a dataframe of 1,000+ scraped posts from specified subreddit
    """Assemble a DataFrame of scraped posts by calling ``pull_data`` repeatedly.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to scrape.
    before : int, optional
        Epoch timestamp to start from; each pull then continues from the
        cutoff returned by the previous one. The default, 1656400000, is
        2022-06-28 07:06:40 UTC. A static start is used because issues were
        had trying time.time().
    n_pulls : int, optional
        Number of times ``pull_data`` is called (default 10).
    pause : int or float, optional
        Seconds to sleep after each pull to avoid errors from the API
        (default 10).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the frames returned by every pull.
    """
    print('START') #Status check
    comb = []

    for i in range(n_pulls):
        print(f'Iteration {i}') #Status check
        batch, before = pull_data(subreddit, before)
        comb.append(batch)
        time.sleep(pause) #Give some extra time in between pulls to make sure no error comes up from the API.

    if not comb:
        #n_pulls < 1: pd.concat would raise on an empty list, so return an empty frame with the expected columns.
        return pd.DataFrame(columns = ['subreddit', 'title', 'selftext'])

    return pd.concat(comb, axis = 0)

In [79]:
#Scrape r/vegan
vegan_df = build(subreddit = 'vegan')

START
Iteration 0
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 1
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 2
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 3
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 4
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 5
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 6
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 7
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 8
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 9
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5


In [80]:
#Scrape r/carnivore
carn_df = build(subreddit = 'carnivore')

START
Iteration 0
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 1
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 2
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 3
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 4
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 5
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Scrape 5
Iteration 6
Pull start
Scrape 1
Scrape 2
Scrape 3
Iteration 7
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 8
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4
Iteration 9
Pull start
Scrape 1
Scrape 2
Scrape 3
Scrape 4


In [81]:
#Row/column counts straight after scraping. Over 1,000 observations each leaves spare room if duplicates exist.
print(vegan_df.shape, carn_df.shape, sep = '\n')

(1139, 3)
(1097, 3)


In [82]:
#Remove rows that are identical across all three columns, keeping the first occurrence of each.
vegan_df.drop_duplicates(keep = 'first', inplace = True)
carn_df.drop_duplicates(keep = 'first', inplace = True)

In [83]:
#Row/column counts after dropping fully identical rows.
print(vegan_df.shape, carn_df.shape, sep = '\n')

(1128, 3)
(1093, 3)


In [84]:
#The earlier drop_duplicates() compared whole rows, so posts with the same description but a different title survived.
#Here only 'selftext' is compared, keeping the first occurrence of each description.
vegan_df.drop_duplicates(subset = ['selftext'], keep = 'first', inplace = True)
carn_df.drop_duplicates(subset = ['selftext'], keep = 'first', inplace = True)

In [85]:
#Highest number of times any single description occurs. max() is used instead of sorting and indexing with [0]:
#the value_counts() index holds the post texts, so [0] relied on deprecated positional fallback.
vegan_df['selftext'].value_counts().max()

1

In [86]:
#Highest number of times any single description occurs. Values of 1 are good --> means there are no duplicates!
#max() is used instead of sorting and indexing with [0], which relied on deprecated positional fallback.
carn_df['selftext'].value_counts().max()

1

In [87]:
#Row/column counts after removing duplicate descriptions; the frames are not yet trimmed to a common length.
print(vegan_df.shape, carn_df.shape, sep = '\n')

(1122, 3)
(1088, 3)


In [105]:
#Keep only the first 1,050 rows of each dataframe (50 over 1,000 for some buffer in case errors come up).
#.copy() makes each result an independent dataframe rather than a slice of the original, so later changes
#to the *_final frames cannot touch (or warn about) vegan_df / carn_df.
vegan_df_final = vegan_df.iloc[:1050].copy()
carn_df_final = carn_df.iloc[:1050].copy()

In [106]:
#Confirm the size of the two trimmed dataframes.
print(vegan_df_final.shape, carn_df_final.shape, sep = '\n')

(1050, 3)
(1050, 3)


In [107]:
#Finally, save our assembled datasets to csv so that we can use them elsewhere.
#'subreddit' is written as the index (first) column. set_index is applied without inplace=True so the
#*_final frames keep their 'subreddit' column and this cell can be re-run without a KeyError.
vegan_df_final.set_index('subreddit').to_csv('../datasets/vegan.csv')
carn_df_final.set_index('subreddit').to_csv('../datasets/carnivore.csv')