# Using Pushshift Module to extract Submissions Data from Reddit via Python

# Import modules

In [10]:
import ciso8601
import pandas as pd
import requests #Pushshift accesses Reddit via an url so this is needed
import json #JSON manipulation
import csv #To Convert final table into a csv file to save to your machine
import time
import datetime

# Pushshift URL Examples

In [40]:
#Adapted from this https://gist.github.com/dylankilkenny/3dbf6123527260165f8c5c3bc3ee331b
#This function builds a Pushshift URL, requests it and returns the list of submissions found in the JSON response
def getPushshiftData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift submission-search endpoint.

    Parameters
    ----------
    query : keyword(s) that must appear in the submission title.
    after, before : lower / upper bound of the search window (Unix timestamps,
        passed through to the API as given).
    sub : name of the subreddit to search in.

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response, i.e. the
    list of submission records for this page.

    Raises
    ------
    requests.HTTPError if the server answers with an error status,
    KeyError if the response has no ``'data'`` key.
    """
    # Local import so the function stays self-contained regardless of the import cell.
    from urllib.parse import urlencode

    # urlencode escapes spaces, '&', '#', ... so multi-word queries cannot corrupt the URL.
    params = urlencode({
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    })
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    #Print URL to show user
    print(url)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    # Surface HTTP errors (rate limiting, outages) instead of failing later while parsing JSON.
    r.raise_for_status()
    #return the data element which contains all the submissions data
    return r.json()['data']

# Extract key information from Submissions

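We want key data for further analysis. The candidate fields are listed below; note that `collectSubData` currently stores only the submission post ID, title, self text and upload time: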
* Submission Title
* URL
* Flair
* Author
* Submission post ID
* Score
* Upload Time
* No. of Comments
* Permalink.


In [41]:
#This function will be used to extract the key data points from each JSON result
def collectSubData(subm):
    """Store the key fields of one submission in the global ``subStats`` dictionary.

    Parameters
    ----------
    subm : dict for a single submission; must contain ``'id'``, ``'title'``
        and ``'created_utc'``. ``'selftext'`` is optional.

    Side effects
    ------------
    Sets ``subStats[<post id>]`` to a one-element list holding the tuple
    ``(post id, title, selftext, created)``. The list wrapper is kept because
    the later cells read the tuple via ``subStats[key][0]``.
    """
    sub_id = subm['id']
    title = subm['title']
    # Not every record carries a 'selftext' field; fall back to an empty string
    # instead of aborting the whole collection run with a KeyError.
    selftext = subm.get('selftext', '')
    # fromtimestamp() without a tz argument yields a naive datetime in local time
    created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0

    #Create a dictionary entry of current submission data, keyed by post id
    subStats[sub_id] = [(sub_id, title, selftext, created)]

# Update your Search Settings here

In [42]:
#Create your timestamps and queries for your search URL
#https://www.unixtimestamp.com/index.php > Use this to create your timestamps
#Both bounds are Unix timestamps (seconds) given as strings.
#NOTE: the paging loop further down overwrites `after`, so re-run this cell before re-running the loop.
#Alternative sketched by the author (needs a date string variable that is not defined in this notebook):
#    int(time.mktime(ciso8601.parse_datetime(date_after).timetuple()))
after = "1577836800" #Submissions after this timestamp (2020-01-01 00:00:00 UTC)
before = "1607040000" #Submissions before this timestamp (2020-12-04 00:00:00 UTC)
query = "COVID" #Keyword(s) to look for in submission titles
sub = "COVID" #Which Subreddit to search in

#subCount tracks the no. of total submissions we collect
subCount = 0
#subStats is the dictionary where we will store our data (post id -> one-element list holding the data tuple).
subStats = {}

In [43]:
# Walk through the search window one page at a time. Every request uses the
# current value of `after`; once a page has been stored, `after` is moved to the
# creation time of that page's last submission. The walk ends as soon as the
# API answers with an empty page.
while True:
    data = getPushshiftData(query, after, before, sub)
    if len(data) == 0:
        # Nothing left between `after` and `before`
        break
    for submission in data:
        collectSubData(submission)
        subCount += 1
    # Progress report: page size and creation time of the last submission in the page
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    # The next request starts where this page stopped
    after = data[-1]['created_utc']

# Size of the final (empty) page
print(len(data))

https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1577836800&before=1607040000&subreddit=COVID
100
2020-03-26 20:59:00
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1585274340&before=1607040000&subreddit=COVID
100
2020-04-01 21:53:37
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1585796017&before=1607040000&subreddit=COVID
100
2020-04-08 04:55:42
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1586339742&before=1607040000&subreddit=COVID
100
2020-04-12 07:54:14
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1586696054&before=1607040000&subreddit=COVID
100
2020-04-17 12:47:55
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1587145675&before=1607040000&subreddit=COVID
100
2020-04-23 15:58:54
https://api.pushshift.io/reddit/search/submission/?title=COVID&size=1000&after=1587675534&before=1607040000&subr

# Check your Submission Extraction was successful

In [44]:
# Sanity check: report how many submissions were stored, then show the title
# (index 1 of the stored tuple) of the first and of the last entry.
num_submissions = len(subStats)
print("%d submissions have added to list" % num_submissions)
for _label, _position in (("1st entry is:", 0), ("Last entry is:", -1)):
    print(_label)
    print(list(subStats.values())[_position][0][1])

3398 submissions have added to list
1st entry is:
Coronavirus disease named Covid-19
Last entry is:
Covid Help - Testing - How To Properly Quarantine?


# Save data to CSV file

In [23]:
def updateSubs_file():
    """Write every submission held in the global ``subStats`` to a new CSV file.

    The file is created as ``data/reddit_<row count>_<millisecond timestamp>.csv``
    (relative to the current working directory) with the header
    ``Post ID, Title, Text, Publish Date`` followed by one row per submission.
    The number of rows written is printed when done. Returns ``None``.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import os

    # Take the count from subStats itself rather than from a variable set in
    # another cell, so the number in the file name always matches the rows written.
    row_total = len(subStats)
    ts = str(round(time.time() * 1000))
    # open() does not create missing directories; make sure data/ exists first.
    os.makedirs("data", exist_ok=True)
    csv_path = "data/reddit_" + str(row_total) + "_" + ts + ".csv"

    upload_count = 0
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["Post ID", "Title", "Text", "Publish Date"])
        for sub_id in subStats:
            # Each value is a one-element list; element 0 is the data tuple.
            writer.writerow(subStats[sub_id][0])
            upload_count += 1

    print(str(upload_count) + " submissions have been uploaded")
# Write everything currently held in subStats to a new CSV file under data/.
# Run the extraction cells above first so that subStats is populated.
updateSubs_file()

40 submissions have been uploaded


# Comments

