In [1]:
import pandas as pd
import requests
import json
from datetime import datetime

## Set Time Anchor

All data generated from this notebook will have been posted to reddit after the Time Anchor.

In [2]:
time_anchor = 1587230611
datetime.fromtimestamp(time_anchor)

datetime.datetime(2020, 4, 18, 12, 23, 31)

#### Function to Pull New Data

In [3]:
url = 'https://api.pushshift.io/reddit/search/submission'

def pull_posts_after(subreddit, n, final_destination, starting_when=time_anchor):
    '''The 'n' parameter determines the number of posts pulled in multiples of 500
        'final_destination' needs to be a list in which to store the posts'''
    for i in range(n):

        # set request parameters
        params = {
            'subreddit': f'{subreddit}',
            'size': 500,
            'after': starting_when
        }

        # create request
        res = requests.get(url, params)

        # print out status code each loop to ensure success
        print('Pulling down data... Status Code:', res.status_code)

        # convert to json, strip away outer layer to get only post data
        data = res.json()
        posts = data['data']

        # append to data
        final_destination.extend(posts)

        # set new timestamp for next loop
        starting_when = posts[0]['created_utc']

## Collecting Posts from r/askscience

In [4]:
data = []

pull_posts_after('askscience', 5, data)

print(f'{len(data)} new observations since Time Anchor.')

Pulling down data... Status Code: 200
Pulling down data... Status Code: 200
Pulling down data... Status Code: 200
Pulling down data... Status Code: 200
Pulling down data... Status Code: 200
2500 new observations since Time Anchor.


In [5]:
# check for duplicates
print(len([i['title'] for i in data]) - len(set([i['title'] for i in data])))

2004


## Collecting Posts from r/shittyaskscience

In [6]:
shitty_data = []

pull_posts_after('shittyaskscience', 1, shitty_data)

print(f'{len(shitty_data)} new observations since Time Anchor.')

Pulling down data... Status Code: 200
98 new observations since Time Anchor.


In [7]:
# check for duplicates
print(len([i['title'] for i in shitty_data]) - len(set([i['title'] for i in shitty_data])))

0


## Dataframe Creation

In [8]:
# build dataframes
df = pd.DataFrame(data)
df['subreddit'] = 'askscience'

shitty_df = pd.DataFrame(shitty_data)
shitty_df['subreddit'] = 'shittyaskscience'

# combine dataframes
combined_df = pd.concat([df, shitty_df])
combined_df.reset_index(drop=True)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,author_cakeday,thumbnail_height,thumbnail_width,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed,author_flair_template_id
0,[],False,BobSponge22,,[],,text,t2_1v74ayx0,False,False,...,,,,,,,,,,
1,[],False,LordNoOne,,[],,text,t2_npu3w,False,False,...,,,,,,,,,,
2,[],False,poltoid,,[],,text,t2_ndw65,False,False,...,,,,,,,,,,
3,[],False,blip99,,[],,text,t2_gbfmm,False,False,...,,,,,,,,,,
4,[],False,theTHIRDfckingacount,,[],,text,t2_59idcdkx,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2593,[],False,landonzy77,,[],,text,t2_1v8hbbfp,False,False,...,,140.0,140.0,,,,,,,
2594,[],False,Notchmath,,[],,text,t2_143nslu,False,False,...,,,,,,,,,,
2595,[],False,Kerbal_Guardsman,,[],,text,t2_c02ilw7,False,False,...,,,,t3_g6kb8a,"[{'all_awardings': [], 'allow_live_comments': ...",,,,,
2596,[],False,Vslightning,,[],,text,t2_g5ba1,False,False,...,,,,,,,,,,


## Exporting to .csv

In [9]:
# naming the file 'aftertimeanchor.csv' in order to keep track of old and new data to be collected in the future

combined_df.to_csv('./data/aftertimeanchor.csv', index=False)