# Collecting Reddit submissions

## Libraries

In [53]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import regex as re
import sqlite3
import requests

import importlib

import utils

In [101]:
# Re-import the local utils module so that edits made to utils.py take effect
# without restarting the kernel
importlib.reload(utils)

<module 'utils' from 'D:\\Python\\Thesis\\utils.py'>

## Setup

In [70]:
# --- Storage locations ---
# SQLite database file and the text file that receives the collected submission ids
sql_db = './data/film_discussions'
discussionarchive_submissions_pushshift = 'data/discussionarchive_submissions_pushshift.txt'

# --- Pushshift query ---
# Each key is a Pushshift parameter name (possible parameters: https://pushshift.io/api-parameters/)
param_dict = dict(
    subreddit='discussionarchive',
    size=1000,  # per-request maximum according to the original author; no reason to change this
    is_self="false",
)

# Fields to keep from each submission
submission_keys = ('id', 'title', 'score', 'num_comments', 'url', 'created_utc')

# Upper bound on the number of submissions to request from the API
submission_limit = 100_000

# --- Database ---
# Connection object for the SQLite database defined above
conn = sqlite3.connect(sql_db)

In [8]:
# Fetch submission ids through utils.get_submission_ids for the 'submissions' table.
# NOTE(review): presumably these are the ids already stored in the database, returned as
# a collection of id strings — confirm against utils.get_submission_ids.
# `with conn` wraps the call in a transaction (commit on success, rollback on error);
# it does not close the connection.
with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')

## Collect submissions

In [71]:
# Based on: https://github.com/SeyiAgboola/Reddit-Data-Mining/blob/master/Using_Pushshift_Module_to_extract_Submissions.ipynb
# Walk backwards through the Pushshift results page by page and collect the Reddit
# submission id embedded in each post's link URL (the path segment after "comments/").
print(f"Starting at {datetime.now()}")
sub_count = 0
archive_data = set()

# Query with a copy of param_dict: the loop adds a 'before' key for paging, and writing it
# into param_dict itself would make a re-run of this cell resume from the oldest
# submission of the previous run instead of starting again from the newest one.
query_params = dict(param_dict)

# Collect first set of submissions
# This happens outside the loop because the loop derives 'before' from the page it holds
data = utils.get_pushshift_data(query_params)

if data:
    print(f"The youngest submission that fits the criteria is from: {datetime.fromtimestamp(data[0]['created_utc'])}")
else:
    # Nothing came back: report it instead of failing on data[0]
    print("No submissions fit the criteria")

while data:
    if sub_count >= submission_limit:
        print(f"Reached submission limit at {datetime.now()}")
        if 'before' in query_params:
            print(f"Didn't collect submissions posted before {datetime.fromtimestamp(query_params['before'])}")
        break

    for submission in data:
        try:
            id_match = re.search("comments/([^/]+)", submission['url'])
        except (KeyError, TypeError):
            # This submission has no usable 'url' field: skip it
            continue
        # URLs that do not point at a Reddit comments page are skipped as well
        if id_match:
            archive_data.add(id_match.group(1))

    # Count what was actually received rather than assuming a fixed page size,
    # so that submission_limit is honoured whatever 'size' is set to
    sub_count += len(data)

    # Set the new 'before' parameter to the timestamp of the oldest submission on this page
    query_params['before'] = data[-1]['created_utc']

    # Collect next set of submissions
    data = utils.get_pushshift_data(query_params)

print(f"Finished at {datetime.now()}")

Starting at 2022-02-12 17:55:02.734003
The youngest submission that fits the criteria is from: 2022-01-15 00:02:24
Finished at 2022-02-12 17:55:30.538722


In [72]:
# Number of distinct submission ids extracted from the collected link URLs
print(len(archive_data))

831


In [73]:
# Persist the collected ids to the text file, one id per line
with open(discussionarchive_submissions_pushshift,'w') as out_file:
    out_file.writelines(submission_id + '\n' for submission_id in archive_data)

In [19]:
# Number of submission ids returned by utils.get_submission_ids
print(len(submission_ids))

1774


In [20]:
# Remove every id that also appears in submission_ids, so that only the ids found in the
# archive but not returned by the database lookup remain.
# Note: this mutates archive_data in place; cells run afterwards see the reduced set.
archive_data.difference_update(submission_ids)

print(len(archive_data))

168


In [30]:
# Display the ids currently held in archive_data
archive_data

{'195vjx',
 '196cjz',
 '1ew8kd',
 '1fdwlf',
 '1fdx03',
 '1gbcth',
 '1grz85',
 '1grzrc',
 '1hl7hz',
 '1i4o8n',
 '1j2um6',
 '1k05aj',
 '1kgqf9',
 '1kx7yg',
 '1n83l8',
 '1n83za',
 '1np3cf',
 '1o82lz',
 '1pnuw3',
 '1r6vqg',
 '1rs512',
 '1s96c5',
 '1tbxm2',
 '1tbxrn',
 '1tpbj4',
 '1tpbsi',
 '1uwxhh',
 '1v2iqp',
 '1xobpv',
 '1ze5f8',
 '1zesnn',
 '20ejot',
 '23bo2s',
 '24ibyf',
 '2b2k07',
 '2e8kte',
 '2ev8ss',
 '2gtkdi',
 '3n03k6',
 '3x2oyf',
 '487kb1',
 '488mq3',
 '4cchdw',
 '5mvnt5',
 '5wdqod',
 '5wevxz',
 '6tyynf',
 '6vzaae',
 '706ydq',
 '73g2fx',
 '761r2y',
 '769d5d',
 '77j3ty',
 '795pgl',
 '7agfes',
 '7doxvl',
 '7eoph7',
 '7gs900',
 '7ik3cb',
 '7ldw3m',
 '7mrl6t',
 '7ovyyn',
 '7ptn7y',
 '7t1ndd',
 '7xvumf',
 '81bwds',
 '81cz97',
 '833jvx',
 '8mmla7',
 '8pa372',
 '8pg3cd',
 '8r7d6q',
 '8yf1pd',
 '928hxl',
 '92ij8o',
 '962sh1',
 '9bplvq',
 '9dpqjg',
 '9fxw74',
 '9n9qjv',
 '9pf3kg',
 '9r6tcs',
 '9t6qga',
 '9vg4og',
 'a1huet',
 'a4qc91',
 'a5mkir',
 'a6qtnh',
 'a9gjba',
 'agxe41',
 'ajfym7',

## Find submission metadata

This lookup only returned metadata for 13 of the submission ids that are missing from the database, so Pushshift's archive doesn't seem to be complete.

In [59]:
# Ask Pushshift for the metadata of every id held in archive_data with a single request
ids = ",".join(archive_data)
url= f"https://api.pushshift.io/reddit/search/submission/?ids={ids}"

# Without a timeout the cell would hang indefinitely if the API never answers
r = requests.get(url, timeout=60)
# Surface HTTP errors with their status code instead of failing later on an
# undecodable body or a missing 'data' key
r.raise_for_status()
data = r.json()['data']

In [67]:
# Print the title of each submission in the Pushshift response
for i in data:
    print(i['title'])

Reminder: /r/movies 31 Days of Horror starts tomorrow with Suspiria!
Official Late-Comer Megathread - Batman v. Superman: Dawn of Justice [SPOILERS]
Official Oscar post game thread 2016
/r/movies Oscars 2017: Official Post-Game Thread
Official Oscar Thread 2014
Official Live Thread - World Premiere of "Star Wars: The Force Awakens" [SPOILERS]
Official Oscar Thread 2016
/r/movies Golden Globes 2016: Official Post-Game Thread
Official Oscar Thread 2017
/r/movies Oscars 2019: Official Post-Game Thread
Live Thread - Avengers: Endgame [SPOILERS]
Official Oscars Thread 2019
/r/movies Official 2018 Golden Globes Post-Game Thread
