Python notebook to crawl subreddit data using the Pushshift API.<br>
It takes the subreddit name and the date range for which the data should be crawled.

In [None]:
#Download and import all dependencies
!pip install datetime #for collab to download the datetime lib
import requests
from datetime import datetime as dt, timezone
from google.colab import drive
import traceback
import time
import json 
import csv 

Collecting datetime
[?25l  Downloading https://files.pythonhosted.org/packages/73/22/a5297f3a1f92468cc737f8ce7ba6e5f245fcfafeae810ba37bd1039ea01c/DateTime-4.3-py2.py3-none-any.whl (60kB)
[K     |█████▌                          | 10kB 16.4MB/s eta 0:00:01[K     |███████████                     | 20kB 2.9MB/s eta 0:00:01[K     |████████████████▍               | 30kB 4.1MB/s eta 0:00:01[K     |█████████████████████▉          | 40kB 4.4MB/s eta 0:00:01[K     |███████████████████████████▎    | 51kB 3.4MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.6MB/s 
Collecting zope.interface
[?25l  Downloading https://files.pythonhosted.org/packages/c5/89/1eb9dbb9e24f5e2c29ab1a88097b2f1333858aac3cd3cccc6c4c1c8ad867/zope.interface-5.1.2-cp36-cp36m-manylinux2010_x86_64.whl (236kB)
[K     |████████████████████████████████| 245kB 12.8MB/s 
Installing collected packages: zope.interface, datetime
Successfully installed datetime-4.3 zope.interface-5.1.2


In [None]:
# Mount Google Drive so the notebook can read from and write to it; Colab asks for an
# authorization code when this runs. Skip this cell when running the notebook locally.
driveMountPoint = "/drive"
drive.mount(driveMountPoint)

Mounted at /drive


In [None]:
# Pushshift submission-search URL template. The two "{}" placeholders are filled with
# str.format(): first the `after` epoch timestamp, then the subreddit name.
# Each request asks for 100 submissions sorted by created_utc in ascending order and
# restricts the response to the listed fields.
pushshiftApiUrl = (
    "https://api.pushshift.io/reddit/submission/search/"
    "?after={}"
    "&subreddit={}"
    "&size=100"
    "&sort_type=created_utc"
    "&sort=asc"
    "&fields=author,author_fullname,created_utc,domain,full_link,"
    "is_crosspostable,link_flair_text,num_comments,num_crossposts,"
    "over_18,permalink,score,selftext,title,total_awards_received"
)

In [None]:
def crawlSubredditPosts(filename,subredditname,startDateTimestamp, endDateTimestamp):
    """Crawl a subreddit's submissions from Pushshift and save them to a CSV file.

    Repeatedly formats the module-level ``pushshiftApiUrl`` template with the
    current cursor and the subreddit name, fetches one page of submissions and
    appends one CSV row per submission. The columns are the names in
    ``headers``; a field missing from a submission is written as "NA", and
    ``created_utc`` is written as a UTC date in ``YYYY-MM-DD`` form.

    Args:
        filename: Path of the CSV file to write. An existing file is overwritten.
        subredditname: Name of the subreddit to crawl.
        startDateTimestamp: Unix epoch seconds used as the ``after`` value of
            the first request.
        endDateTimestamp: Unix epoch seconds. Crawling stops once this point is
            reached; submissions created at or after it are not written.

    Returns:
        None. The result is the CSV file, which always starts with the header
        row (even when no submission was found).

    Raises:
        KeyError: If a returned submission has no ``created_utc`` field.
        Any exception raised by ``requests.get`` or by decoding the response
        body as JSON propagates to the caller; the file is closed first.
    """
    headers = ["author","author_fullname","created_utc","domain","full_link","is_crosspostable","link_flair_text","num_comments","num_crossposts","over_18","permalink","score","selftext","title","total_awards_received"]
    count = 0
    previous_epoch = startDateTimestamp
    # Identity keys of every post seen on the previous page. The cursor is rewound
    # by one second after each page (see below), so the next page can start with
    # posts that were already written; these keys let us skip them.
    previous_page_keys = set()

    # `with` closes the file even if a request or JSON decode fails.
    # newline='' is what the csv module requires for files it writes; UTF-8 keeps
    # non-ASCII titles/selftext from failing on platforms with another default.
    with open(filename, 'w', newline='', encoding='utf-8') as fileop:
        csv_writer = csv.writer(fileop)
        csv_writer.writerow(headers)

        while previous_epoch < endDateTimestamp:
            new_url = pushshiftApiUrl.format(previous_epoch,subredditname)
            print("Hitting :  ", new_url)

            response = requests.get(new_url, headers={'User-Agent': "Bot downloader"})
            time.sleep(2) # pushshift has a SLA rate limit, if we send requests too fast it will start returning error messages
            json_data = response.json()
            if 'data' not in json_data:
                break

            posts = json_data['data']
            if len(posts) == 0:
                break

            page_keys = set()
            new_rows = 0
            reached_end = False
            for post in posts:
                created_utc = post['created_utc']
                if created_utc >= endDateTimestamp:
                    # Results are requested in ascending order, so everything from
                    # here on is outside the requested range.
                    reached_end = True
                    break

                # Cursor for the next request. It is rewound by one second so that
                # posts sharing the last timestamp of this page are not lost.
                previous_epoch = created_utc - 1

                post_key = (created_utc, post.get('permalink'), post.get('full_link'), post.get('author'), post.get('title'))
                page_keys.add(post_key)
                if post_key in previous_page_keys:
                    continue # already written while processing the previous page

                # Convert the timestamp to a human readable UTC date without
                # mutating the decoded response.
                record = dict(post, created_utc=dt.fromtimestamp(created_utc, timezone.utc).strftime("%Y-%m-%d"))
                # Emit the values in header order so columns always line up.
                csv_writer.writerow([record.get(header, "NA") for header in headers])
                count += 1
                new_rows += 1

            print("Saved {} record(s) till date {}".format(count, dt.fromtimestamp(previous_epoch, timezone.utc).strftime("%Y-%m-%d")))

            # Stop at the end of the range, or when a page contained nothing new
            # (otherwise the rewound cursor would request the same page forever).
            if reached_end or new_rows == 0:
                break
            previous_page_keys = page_keys

In [None]:
# Date range to crawl; both dates are treated as UTC.
startDate = dt(2020, 1, 1) #start date
endDate = dt(2020, 4, 1) #end date

# Tag each naive datetime as UTC and convert it to integer Unix epoch seconds.
startDateTimestamp, endDateTimestamp = (
    int(day.replace(tzinfo=timezone.utc).timestamp()) for day in (startDate, endDate)
)
print("startDateTimestamp ",startDateTimestamp)
print("endDateTimestamp ",endDateTimestamp)

subredditname = "emacs"

# The CSV is written to the mounted Google Drive; point this at a local path instead
# when running the notebook on a local file system.
outputPath = "/drive/My Drive/Colab Notebooks/{}.{}".format(subredditname + "_raw", "csv")

# Note: results are fetched 100 posts per API call. If an exact cut-off matters, check
# the saved rows for posts dated past endDate and filter them out if any are present.
crawlSubredditPosts(outputPath, subredditname, startDateTimestamp, endDateTimestamp)

startDateTimestamp  1577836800
endDateTimestamp  1585699200
Hitting :   https://api.pushshift.io/reddit/submission/search/?after=1577836800&subreddit=emacs&size=100&sort_type=created_utc&sort=asc&fields=author,author_fullname,created_utc,domain,full_link,is_crosspostable,link_flair_text,num_comments,num_crossposts,over_18,permalink,score,selftext,title,total_awards_received
Saved 100 record(s) till date 2020-01-07
Hitting :   https://api.pushshift.io/reddit/submission/search/?after=1578407316&subreddit=emacs&size=100&sort_type=created_utc&sort=asc&fields=author,author_fullname,created_utc,domain,full_link,is_crosspostable,link_flair_text,num_comments,num_crossposts,over_18,permalink,score,selftext,title,total_awards_received
Saved 200 record(s) till date 2020-01-14
Hitting :   https://api.pushshift.io/reddit/submission/search/?after=1578982532&subreddit=emacs&size=100&sort_type=created_utc&sort=asc&fields=author,author_fullname,created_utc,domain,full_link,is_crosspostable,link_flair_t