# Collecting Reddit submissions

## Libraries

In [None]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import regex as re
import sqlite3
import requests

import utils
from params import sql_db, discussionarchive_submissions_pushshift

## Setup

In [None]:
# Query parameters for the Pushshift search.
# Each keyword is an API parameter name (see https://pushshift.io/api-parameters/ for possible parameters)
param_dict = dict(
    subreddit='discussionarchive',
    size=1000,  # per the original author: 1000 is the most a single request can return, so leave it as is
    is_self="false",
)

# Fields of interest on each submission
submission_keys = ('id', 'title', 'score', 'num_comments', 'url', 'created_utc')

# Upper bound on the number of submissions to request from the API
submission_limit = 100_000

# Open a connection to the SQLite database whose path is configured in params.sql_db
conn = sqlite3.connect(sql_db)

In [None]:
# Ask the project helper for the submission ids associated with the 'submissions' table.
# NOTE(review): presumably these are the ids already stored in the database — verify
# against utils.get_submission_ids.
# Using the sqlite3 connection as a context manager commits (or rolls back on an
# exception) when the block exits; it does not close the connection.
with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')

## Collect submissions

In [None]:
# Based on: https://github.com/SeyiAgboola/Reddit-Data-Mining/blob/master/Using_Pushshift_Module_to_extract_Submissions.ipynb
#
# Page through the Pushshift results for param_dict and collect, for every
# returned submission, the id that follows "comments/" in its 'url' field.
# The result is the set `archive_data`.
# NOTE(review): the paging assumes each page is ordered newest-first, so that
# data[0] is the youngest item and data[-1] the oldest — confirm against
# utils.get_pushshift_data.
print(f"Starting at {datetime.now()}")
sub_count = 0
archive_data = set()

# Captures the path segment right after "comments/"; compiled once, outside the loop
comment_id_pattern = re.compile(r"comments/([^/]+)")

# Collect first set of submissions
# The first page has to be fetched before the loop so that its last item can
# seed the 'before' cursor
data = utils.get_pushshift_data(param_dict)

if data:
    print(f"The youngest submission that fits the criteria is from: {datetime.fromtimestamp(data[0]['created_utc'])}")
else:
    # Previously an empty first page crashed with IndexError on data[0]
    print("No submissions matched the search parameters")

while data:
    if sub_count >= submission_limit:
        print(f"Reached submission limit at {datetime.now()}")
        # 'before' is only absent when the limit was hit before any page was processed
        if 'before' in param_dict:
            print(f"Didn't collect submissions posted before {datetime.fromtimestamp(param_dict['before'])}")
        break

    for submission in data:
        # Best effort: skip submissions without a usable 'url' instead of aborting,
        # but only swallow the errors that case can actually produce
        try:
            match = comment_id_pattern.search(submission['url'])
        except (KeyError, TypeError):
            continue
        if match:
            archive_data.add(match.group(1))

    # Count what was actually received rather than assuming a fixed page size
    sub_count += len(data)

    # Set the new 'before' parameter so the next request continues after this page
    param_dict['before'] = data[-1]['created_utc']

    # Collect next set of submissions
    data = utils.get_pushshift_data(param_dict)


print(f"Finished at {datetime.now()}")

In [None]:
# Number of distinct ids collected in archive_data
print(len(archive_data))

In [None]:
# Persist the collected ids as plain text, one id per line
with open(discussionarchive_submissions_pushshift, 'w') as out_file:
    out_file.writelines(submission_id + '\n' for submission_id in archive_data)

In [None]:
# Number of ids returned by utils.get_submission_ids for the 'submissions' table
print(len(submission_ids))

In [None]:
# Remove, in place, every id that also appears in submission_ids,
# then report how many ids are left
archive_data -= set(submission_ids)

print(len(archive_data))

## Find submission metadata

This lookup only returned metadata for 13 of the submissions; the Pushshift archive appears to be incomplete.

In [None]:
# Request the Pushshift records for the remaining ids in a single call.
# The ids are sent as one comma-separated `ids` query parameter.
ids = ",".join(archive_data)

if ids:
    url = f"https://api.pushshift.io/reddit/search/submission/?ids={ids}"

    # A timeout keeps the cell from hanging forever if the API does not answer
    r = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError further down
    r.raise_for_status()
    data = r.json()['data']
else:
    # Nothing to look up; skip the request rather than sending an empty `ids` filter
    data = []

In [None]:
# Show the title of every record in data
for record in data:
    print(record['title'])