# Collecting Reddit submissions

## Libraries

In [None]:
import praw
from tqdm import tqdm
from datetime import datetime
import sqlite3
import regex as re

import utils
from params import sql_db, discussionarchive_submissions_pushshift, reddit_praw_id

## Setup

In [None]:
# Open a connection to the SQLite database whose path is given by `sql_db` (imported from params).
# The connection object is reused by every database cell below.
conn = sqlite3.connect(sql_db)

# Create the PRAW Reddit instance used for all API calls in this notebook.
# `reddit_praw_id` is passed as the first argument to praw.Reddit - presumably the name of a
# section in praw.ini holding the author's personal credentials (TODO confirm against params).
# The praw.ini file was removed from this folder before the project was handed in, which is
# why this cell now raises an error when run as-is.
# See: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
reddit = praw.Reddit(reddit_praw_id)

In [None]:
# Ask the project helper for the ids in the 'submissions' table (presumably the ids already
# stored there - verify in utils). Later cells use `submission_ids` to filter out submissions
# that do not need to be collected again.
with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')

In [None]:
# Notebook display: number of ids returned by the previous cell.
len(submission_ids)

## Collect submissions

In [None]:
# Search r/movies for "Official Discussion" under each sort order and store one
# six-field row per hit. The same thread can be returned by several sort orders,
# so `submissions_data` may contain repeated ids at this point.
print(f"Starting at {datetime.now()}")
submissions_data = []

sort_orders = ["relevance", "hot", "top", "new", "comments"]

for order in tqdm(sort_orders):
    hits = reddit.subreddit('movies').search("Official Discussion", sort=order, limit=1000)
    for post in hits:
        # Row layout: id, title, score, num_comments, url, created_utc (as int).
        submissions_data.append([
            post.id,
            post.title,
            post.score,
            post.num_comments,
            post.url,
            int(post.created_utc),
        ])

In [None]:
# Sanity check: show the first two collected rows.
print(submissions_data[:2])

In [None]:
# Keep the first occurrence of every id that is not already in `submission_ids`.
# `set_submissions` records the ids accepted here and is reused further down
# when filtering the r/discussionarchive ids.
new_submissions = []
set_submissions = set()

for row in submissions_data:
    row_id = row[0]
    # Skip repeats within this run first, then ids that `submission_ids` already holds.
    if row_id in set_submissions or row_id in submission_ids:
        continue
    new_submissions.append(row)
    set_submissions.add(row_id)

print(len(new_submissions))

In [None]:
# Print five rows of the submissions table. The third argument is a string naming the cursor
# call to run - presumably evaluated inside utils.interact_with_db (TODO confirm in utils).
print(utils.interact_with_db(conn, "SELECT * FROM submissions LIMIT 5", "cur.fetchall()"))

In [None]:
# Each collected row has six fields: id, title, score, num_comments, url, created_utc.
n_submission_attributes = 6

# Insert only the filtered rows. `new_submissions` holds each id once and leaves out
# ids found in `submission_ids`, whereas `submissions_data` still contains every hit
# from all five sort orders (repeats included).
# NOTE(review): if utils.add_rows was relied on to refresh existing rows, pass
# `submissions_data` again - confirm against utils.
with conn:
    n_submissions = utils.add_rows(conn, 'submissions', n_submission_attributes, new_submissions)

# Using r/discussionarchive

In [None]:
# Collect the submission ids that posts in r/discussionarchive link to.
# The id is the path segment that follows "comments/" in each post's URL.
archive_data = set()
comment_id_pattern = re.compile(r"comments/([^/]+)")

for submission in reddit.subreddit('discussionarchive').new(limit=1000):
    match = comment_id_pattern.search(submission.url)
    if match is None:
        # Not a link to a Reddit comment thread: skip this post and keep going.
        # The earlier bare `except: break` ended the whole loop at the first such URL.
        continue
    archive_data.add(match.group(1))

# The author's note recorded only 812 submissions here. Non-matching URLs are now
# skipped instead of stopping the loop early.

In [None]:
# Notebook display: number of distinct ids extracted from r/discussionarchive.
len(archive_data)

## Include those found using Pushshift (see other notebook)

In [None]:
# Load the ids exported by the Pushshift notebook, one id per line.
# Surrounding whitespace is trimmed and blank lines are dropped so that an empty
# string can never be treated as a submission id further down.
with open(discussionarchive_submissions_pushshift,'r') as f:
    stripped_lines = (line.strip() for line in f.read().splitlines())
    archive_pushshift = {line for line in stripped_lines if line}

print(len(archive_pushshift))

In [None]:
# Merge the Pushshift ids into the ids found through PRAW (in-place set union).
archive_data.update(archive_pushshift)

print(len(archive_data))

## See what is newly found

In [None]:
# Remove every id that also appears in `submission_ids` (loaded from the database helper above).
archive_data.difference_update(submission_ids)

print(len(archive_data))

In [None]:
# Remove the ids accepted from the r/movies search results (`set_submissions`).
archive_data.difference_update(set_submissions)

print(len(archive_data))

In [None]:
# NOTE(review): this replaces the id set computed above with a single hard-coded id, so the
# collection cell below fetches only that one submission. Looks like a manual one-off re-run -
# confirm (or skip this cell) before running the notebook top to bottom.
archive_data = ['11lgm8']

## Collect their submission data

In [None]:
# Fetch the metadata of every remaining archive id and build one six-field row each
# (id, title, score, num_comments, url, created_utc as int).
submissions_data = []

for sub_id in tqdm(archive_data):
    try:
        submission = reddit.submission(sub_id)

        submission_list = [
            submission.id,
            submission.title,
            submission.score,
            submission.num_comments,
            submission.url,
            int(submission.created_utc),
        ]
    except Exception as exc:
        # Best effort: report the failing id together with the cause and move on.
        # Catching Exception instead of using a bare `except` lets KeyboardInterrupt
        # and SystemExit stop the loop as expected.
        print(f"Something went wrong with {sub_id}: {exc!r}")
    else:
        submissions_data.append(submission_list)

In [None]:
# Show the rows collected for the archive ids.
print(submissions_data)

In [None]:
# Notebook display: number of archive submissions fetched successfully.
len(submissions_data)

In [None]:
# Each collected row has six fields: id, title, score, num_comments, url, created_utc.
n_submission_attributes = 6

# Hand the archive rows to the project helper for insertion into the 'submissions' table.
# The helper's return value is kept in `n_submissions` - presumably a row count (verify in utils).
with conn:
    n_submissions = utils.add_rows(conn, 'submissions', n_submission_attributes, submissions_data)

In [None]:
# Notebook display: row count of the submissions table before the cleanup below.
utils.interact_with_db(conn, "SELECT COUNT(*) FROM submissions", "cur.fetchone()")

In [None]:
# Cleanup: delete rows whose submission_id is NULL or consists only of whitespace.
sql = "DELETE FROM submissions WHERE submission_id IS NULL OR trim(submission_id) = '';"

# `commit=True` is passed to the helper - presumably it commits the statement itself
# (verify in utils).
with conn:
    utils.interact_with_db(conn, sql, commit=True)

In [None]:
# Notebook display: row count of the submissions table after the cleanup.
utils.interact_with_db(conn, "SELECT COUNT(*) FROM submissions", "cur.fetchone()")