In [2]:
import time
import requests
import pandas as pd
from datetime import datetime, timedelta

# Raise pandas' display width (in characters) so wide frames are not wrapped when printed.
pd.set_option('display.width', 5000)

In [3]:
def daterange(start_date, end_date):
  """Yield one value per whole day in the half-open range [start_date, end_date).

  The first value is ``start_date`` itself and each following value is one day
  later; ``end_date`` is never yielded. Only the ``.days`` component of the
  difference is counted, so a trailing partial day produces no extra value.
  Nothing is yielded when ``end_date`` is not after ``start_date``.
  """
  whole_days = int((end_date - start_date).days)
  offset = 0
  while offset < whole_days:
    yield start_date + timedelta(offset)
    offset += 1

def timerange(start_time, end_time, step_hours=12):
  """Yield datetimes from start_time towards end_time in fixed-size steps.

  Args:
    start_time: first value yielded.
    end_time: exclusive upper bound; only steps that fit completely before it
      are produced.
    step_hours: step size in hours. Defaults to 12.

  Yields:
    start_time, start_time + step, start_time + 2*step, ... for every complete
    step inside [start_time, end_time). Nothing is yielded when end_time is not
    after start_time.

  Raises:
    ValueError: on first iteration, if step_hours is not positive.
  """
  if step_hours <= 0:
    raise ValueError('step_hours must be positive')
  step = timedelta(hours=step_hours)
  # Divide the full span by the step instead of using `.days`: `.days` drops the
  # sub-day part, so e.g. a 36-hour span used to yield only two 12-hour steps.
  step_count = (end_time - start_time) // step
  for index in range(step_count):
    yield start_time + index * step

In [5]:
#sanity checks for the endpoints
# One small query (10 results) against the submission search endpoint; the cell
# ends with the response object so its status code is displayed.
test_request = requests.get(
    'https://api.pushshift.io/reddit/search/submission',
    params={
        'q': 'hurts',
        'sort': 'desc',
        'sort_type': 'score',
        'size': 10,
        'metadata': 'true',
        'subreddit': 'askdocs',
        'fields': ['title', 'selftext', 'subreddit', 'score', 'upvote_ratio',
                   'total_awards_received', 'full_link', 'link_flair_text',
                   'author', 'id', 'permalink', 'url', 'num_comments'],
    },
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout=30,
)
test_json = test_request.json()
test_request

<Response [200]>

In [None]:
#Submissions endpoint
# Request template for the submission search. 'after' and 'before' start as
# None and are assigned per request by the collection loop.
submission_endpoint = 'https://api.pushshift.io/reddit/search/submission'

params = dict(
    q='',
    sort='desc',
    sort_type='score',
    size=100,
    after=None,
    before=None,
    metadata='true',
    fields=[
        'title', 'selftext', 'subreddit', 'score', 'upvote_ratio',
        'total_awards_received', 'full_link', 'link_flair_text',
        'author', 'id', 'permalink', 'url', 'num_comments',
    ],
)

# Empty accumulator frame with one column per requested field, in the same order.
responses_df = pd.DataFrame(columns=list(params['fields']))

In [None]:
#24h collection
# Query the submission endpoint once per one-day window between start_date and
# end_date, tag every returned item with the window's date, and write the
# combined result to CSV.
start_date, end_date = datetime(2022, 9, 23, 0, 0), datetime(2022, 9, 30, 0, 0)

# Rows are gathered in a plain list and turned into a frame once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
# If a request fails mid-run, the rows fetched so far are still in this list.
collected_submissions = []
for date in daterange(start_date, end_date):
  params['after'], params['before'] = date, date + timedelta(days=1)

  # timeout: do not hang the kernel on a stalled connection.
  response = requests.get(submission_endpoint, params=params, timeout=60)
  # Fail with the HTTP status instead of a JSON/KeyError further down.
  response.raise_for_status()
  payload = response.json()  # parse the body once
  data, metadata = payload['data'], payload['metadata']
  if not data:
    print(date.date(), 'empty')
  else:
    print(date.date(), metadata['size'], metadata['shards'], metadata['timed_out'], metadata['size'])

  for item in data:
    item['date'] = date.date()
  collected_submissions.extend(data)

  # Pause between requests.
  time.sleep(5)

responses_df = pd.concat([responses_df, pd.DataFrame(collected_submissions)], ignore_index=True)

#save_to_csv
responses_df.to_csv('file_name_submissions.csv', index=False)

2022-09-23 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-24 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-25 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-26 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-27 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-28 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100
2022-09-29 100 {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4} False 100


In [None]:
#Comments endpoint
# Request template for the comment search. 'after' and 'before' start as None
# and are assigned per request by the collection loop. Note that this rebinds
# the module-level name `params`.
comment_endpoint = 'https://api.pushshift.io/reddit/search/comment'

params = dict(
    q='',
    sort='desc',
    sort_type='score',
    size=100,
    after=None,
    before=None,
    metadata='true',
    fields=[
        'author', 'body', 'id', 'link_id', 'permalink',
        'score', 'subreddit', 'is_submitter', 'total_awards_received',
    ],
)

# Empty accumulator frame with one column per requested field, in the same order.
responses_comments_df = pd.DataFrame(columns=list(params['fields']))

In [None]:
#24h collection
# Query the comment endpoint once per one-day window between start_date and
# end_date, tag every returned item with the window's date, and write the
# combined result to CSV.
start_date, end_date = datetime(2017, 10, 9, 0, 0), datetime(2017, 10, 16, 0, 0)

# Rows are gathered in a plain list and turned into a frame once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
# If a request fails mid-run, the rows fetched so far are still in this list.
collected_comments = []
for date in daterange(start_date, end_date):
  params['after'], params['before'] = date, date + timedelta(days=1)

  # timeout: do not hang the kernel on a stalled connection.
  response = requests.get(comment_endpoint, params=params, timeout=60)
  # Fail with the HTTP status instead of a JSON/KeyError further down.
  response.raise_for_status()
  payload = response.json()  # parse the body once
  data, metadata = payload['data'], payload['metadata']
  if not data:
    print(date, 'empty')
  else:
    print(date, metadata['size'], metadata['shards'], metadata['timed_out'], metadata['size'])

  for item in data:
    item['date'] = date.date()
  collected_comments.extend(data)

  # Pause between requests.
  time.sleep(5)

responses_comments_df = pd.concat([responses_comments_df, pd.DataFrame(collected_comments)], ignore_index=True)

#save to csv
responses_comments_df.to_csv('file_name_submissions_comments.csv', index=False)

2017-10-09 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-10 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-11 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-12 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-13 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-14 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
2017-10-15 00:00:00 100 {'failed': 0, 'skipped': 0, 'successful': 67, 'total': 74} False 100
