# Reddit Scraper

## Generic query function for the pushshift.io Reddit API 

In [1]:
import requests

def query_reddit(subreddit = '', type = 'comment', username = '',
                 before = '', link_id='', size = 1000, timeout = None):
    """Run one search request against the pushshift.io Reddit API.

    Args:
        subreddit: Restrict results to this subreddit; '' means no filter.
        type: Endpoint segment inserted into the URL, e.g. 'comment' or
            'submission'. (The name shadows the builtin but is kept because
            callers pass it by keyword.)
        username: Sent as the 'author' filter; '' means no filter.
        before: Sent as the 'before' filter; '' means no filter. Callers in
            this file pass a record's 'created_utc' value here.
        link_id: Sent as the 'link_id' filter; '' means no filter.
        size: Sent as the 'size' parameter on every request.
        timeout: Forwarded to requests.get. The default None keeps the
            previous behaviour of waiting indefinitely.

    Returns:
        The value stored under the 'data' key of the JSON response
        (callers in this file treat it as a list of record dicts), or an
        empty dict when the body is not JSON or has no 'data' key.

    Raises:
        requests.RequestException: Connection-level failures are not caught
            here and propagate to the caller.
    """
    # Only filters the caller actually supplied are sent; 'size' always is.
    # NOTE: a 'fields' filter used to be prepared here but was never sent,
    # so full records are returned.
    params = {'size': size}
    if before:
        params['before'] = before
    if username:
        params['author'] = username
    if subreddit:
        params['subreddit'] = subreddit
    if link_id:
        params['link_id'] = link_id

    response = requests.get(
        'https://api.pushshift.io/reddit/' + type + '/search',
        params=params,
        timeout=timeout,
    )

    # Decode the body exactly once. A body that is not valid JSON raises a
    # ValueError subclass; that is the only failure treated as "no data".
    try:
        payload = response.json()
    except ValueError:
        return {}

    if isinstance(payload, dict) and 'data' in payload:
        return payload['data']
    return {}


## Testing pad

In [50]:
# begin = ''
# while True:
#     doc = query_reddit(type = 'submission', username = 'AutoModerator', before = begin)
#     length = len(doc)
#     begin = doc[length - 1]['created_utc']  if length > 0 else ''
#     print(length)
#     print(begin)
#     print('_________')
#     if not begin:
#         break

# doc = query_reddit(type = 'submission', username = 'AutoModerator')
# length = len(doc)
# begin = doc[length - 1]['created_utc']  if length > 0 else ''
# print(length)
# print(begin)
# print('_________')


## Functions to poll submissions and comments

# get_submissions:
id: ID of the submission. This is called link_id in comments.
# get_comments:
link_id: ID of the submission where the comment was made.

In [19]:
from collections import defaultdict
import itertools
from IPython.display import display, clear_output

# Module-level accumulators shared by the functions in this notebook.
# get_submissions() appends every fetched submission to all_submissions and
# get_comments() appends every fetched comment to all_comments;
# comments_from_submissions() iterates all_submissions.
# Nothing visible here ever clears them, so they keep growing across calls
# until the cell defining them is re-run.
all_submissions = []
all_comments = []

def get_submissions(subreddit):
    """Fetch submissions of *subreddit* page by page and dump them to disk.

    Each request passes the 'created_utc' of the last record of the previous
    page as ``before``; paging stops at the first empty page (a failed query
    also yields an empty result) or when that timestamp is falsy.

    Side effects:
        * Every fetched record is appended to the module-level
          ``all_submissions`` list.
        * The records fetched by THIS call are written, one ``str(record)``
          per line, to 'data/submissions_<subreddit>.data'. Records left in
          ``all_submissions`` by earlier calls are no longer written into
          this subreddit's file.
        * A small '~' progress indicator is redrawn after every page.

    Args:
        subreddit: Name of the subreddit to download; also used in the
            output file name.
    """
    loading_text = "~"
    begin = ''
    fetched = []  # records of this call only; the global list may hold more
    while True:
        doc = query_reddit(subreddit=subreddit, type='submission', before=begin)
        if not doc:
            break
        begin = doc[-1]['created_utc']
        if not begin:
            # Without a usable timestamp the next request could not advance.
            break
        fetched.extend(doc)
        all_submissions.extend(doc)

        # Redraw the progress indicator: grows to 10 characters, then resets.
        clear_output(wait=True)
        display(loading_text)
        if len(loading_text) < 10:
            loading_text = '~' + loading_text
        else:
            loading_text = '~'

    # The with-statement closes the file; no explicit close() is needed.
    with open('data/submissions_'+ subreddit + '.data', 'w') as f:
        # (item,) keeps the formatting correct whatever type the record is.
        f.writelines("%s\n" % (item,) for item in fetched)
    print("Ding!")


def get_comments(subreddit):
    """Fetch comments of *subreddit* page by page and dump them to disk.

    Each request passes the 'created_utc' of the last record of the previous
    page as ``before``; paging stops at the first empty page (a failed query
    also yields an empty result) or when that timestamp is falsy.

    Side effects:
        * Every fetched record is appended to the module-level
          ``all_comments`` list.
        * The records fetched by THIS call are written, one ``str(record)``
          per line, to 'data/comments_<subreddit>.data'. Records left in
          ``all_comments`` by earlier calls are no longer written into this
          subreddit's file.
        * A small '~' progress indicator is redrawn after every page.

    Args:
        subreddit: Name of the subreddit to download; also used in the
            output file name.
    """
    loading_text = "~"
    begin = ''
    fetched = []  # records of this call only; the global list may hold more
    while True:
        doc = query_reddit(subreddit=subreddit, type='comment', before=begin)
        if not doc:
            break
        begin = doc[-1]['created_utc']
        if not begin:
            # Without a usable timestamp the next request could not advance.
            break
        fetched.extend(doc)
        all_comments.extend(doc)

        # Redraw the progress indicator: grows to 10 characters, then resets.
        clear_output(wait=True)
        display(loading_text)
        if len(loading_text) < 10:
            loading_text = '~' + loading_text
        else:
            loading_text = '~'

    # The with-statement closes the file; no explicit close() is needed.
    with open('data/comments_'+ subreddit + '.data', 'w') as f:
        # (item,) keeps the formatting correct whatever type the record is.
        f.writelines("%s\n" % (item,) for item in fetched)
    print("Ding!")

## Get all comments of all submissions (crashes randomly?)

In [None]:
comments_under_submissions = []
def comments_from_submissions(subreddit='news',
                              output_path='comments_of_submissions_news.csv'):
    """Fetch the comments of every submission in ``all_submissions``.

    For each submission one dict ``{'link_id': <id>, 'comments': [...]}`` is
    written as a single ``str(dict)`` line to *output_path* and its comment
    list is printed. Comments are paged by passing the 'created_utc' of the
    last record of the previous page as ``before`` until a page comes back
    empty.

    Args:
        subreddit: Subreddit filter sent with every comment query. Defaults
            to 'news', the value that used to be hard-coded.
        output_path: File to write. Defaults to the previously hard-coded
            'comments_of_submissions_news.csv'.

    Notes:
        * A submission without a usable 'id' gets an entry with an empty
          comment list instead of raising KeyError.
        * The file is managed by a with-statement, so the lines already
          written are flushed and the handle is closed even if a request
          raises part-way through.
    """
    with open(output_path, 'w') as f:
        for submission in all_submissions:
            submission_id = submission.get('id')
            entry = {'link_id': submission_id, 'comments': []}

            # A falsy id must not be queried: query_reddit drops empty
            # filters, so the request would match the whole subreddit.
            if submission_id:
                begin = ''
                while True:
                    doc = query_reddit(subreddit=subreddit, type='comment',
                                       link_id=submission_id, before=begin)
                    if not doc:
                        break
                    begin = doc[-1]['created_utc']
                    entry['comments'].extend(doc)

            # (entry,) keeps %-formatting unambiguous for a dict operand.
            f.write("%s\n" % (entry,))
            print(entry['comments'])
    print("Done!")

## Creating a pandas DataFrame from a .data file and saving the DataFrame
Feather and Parquet are not working (pyarrow has not fully implemented them yet), so the DataFrame is saved as a pickle instead.

In [19]:
import ast 
from pandas import DataFrame

def convert_data(file_name):
    """Convert a '.data' dump into a pickled pandas DataFrame.

    Every non-blank line of *file_name* is parsed with ``ast.literal_eval``
    (the dump functions in this notebook write one ``str(dict)`` per line).
    The parsed records become the rows of a DataFrame, which is saved next
    to the input as ``file_name + '.pickle'``.

    Args:
        file_name: Path of the text file to convert.

    Raises:
        ValueError, SyntaxError: If a non-blank line is not a valid Python
            literal.
    """
    # The with-statement closes the file even when a line fails to parse.
    with open(file_name, 'r') as source:
        # Blank lines (e.g. a stray trailing newline) are skipped because
        # literal_eval rejects empty input.
        records = [ast.literal_eval(line) for line in source if line.strip()]
    DataFrame(records).to_pickle(file_name + '.pickle')


In [27]:
import pandas
# Driver cell: uncomment the steps below as needed.
# NOTE(review): the pickle paths below use 'star_wars', while
# get_submissions('StarWars') / get_comments('StarWars') write
# 'data/..._StarWars.data' — confirm the files were renamed or converted
# under these names.
# get_submissions('StarWars')
# get_comments('StarWars')
# comments_from_submissions()
# submissions = pandas.read_pickle('data/submissions_star_wars.data.pickle')
# comments = pandas.read_pickle('data/comments_star_wars.data.pickle')
# Bare expression: in a notebook this displays the `comments` DataFrame.
# NOTE(review): in the visible cells `comments` is only assigned by the
# commented-out read_pickle line above — confirm it is defined before
# running this cell.
comments

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,distinguished,edited,author_cakeday
0,[],,Ron-Swanson-Mustache,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
1,[],,[deleted],,,,,,dark,,...,True,False,StarWars,t5_2qi4s,,0,[],,,
2,[],,Greedyocracy,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
3,[],,packetmon,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
4,[],,bielarex,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36695,[],,TrotskyStalin,,1-12 bigFlair,[],f4f5068c-77ed-11e9-add5-0e9fb131d2e8,Clone Trooper,dark,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
36696,[],,VaultBoy42,,3-10 bigFlair,[],364eda62-77ef-11e9-939b-0e1ee5586816,Lando Calrissian,dark,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
36697,[],,Revenant_Ascent,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
36698,[],,persistentInquiry,,,[],,,,text,...,True,False,StarWars,t5_2qi4s,,0,[],,,
