# Collecting Reddit submissions

## Libraries

In [1]:
import praw
from tqdm import tqdm
from datetime import datetime
import sqlite3
import regex as re

import utils
from params import sql_db, discussionarchive_submissions_pushshift, reddit_praw_id

## Setup

In [2]:
# Open a connection to the SQLite database file located at `sql_db` (path imported from params)
conn = sqlite3.connect(sql_db)

# Create a Reddit instance in PRAW. `reddit_praw_id` names the praw.ini site section that holds
# my personal Reddit credentials (username, password etc.).
# Before handing in the project I removed the praw.ini file from this folder, which is why it now gives an error message
# See: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
reddit = praw.Reddit(reddit_praw_id)

Version 7.2.0 of praw is outdated. Version 7.5.0 was released Sunday November 14, 2021.


In [3]:
# Fetch the submission IDs for the 'submissions' table; later cells use them to filter out
# submissions that are already known.
# NOTE(review): utils.get_submission_ids is defined outside this notebook - presumably it returns
# the IDs currently stored in that table; confirm its return type (list vs. set).
# `with conn:` runs the call inside a transaction (commit on success, rollback on error);
# it does not close the connection.
with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')

In [10]:
# How many submission IDs were loaded in the previous cell
len(submission_ids)

1774

## Collect submissions

In [18]:
print(f"Starting at {datetime.now()}")
submissions_data = []

# Run the same r/movies search once per sort order (presumably to surface more distinct
# threads than a single sort would) and gather every hit into one flat list.
# Duplicates across sort orders are expected here; they are filtered in a later cell.
sort_orders = ["relevance", "hot", "top", "new", "comments"]

for sort_type in tqdm(sort_orders):
    search_hits = reddit.subreddit('movies').search("Official Discussion", sort=sort_type, limit=1000)

    # One row per hit: [id, title, score, num_comments, url, created_utc as whole seconds]
    submissions_data.extend(
        [
            hit.id,
            hit.title,
            hit.score,
            hit.num_comments,
            hit.url,
            int(hit.created_utc),
        ]
        for hit in search_hits
    )

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Starting at 2022-02-14 11:45:28.672150


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.42s/it]


In [19]:
# Peek at the first two collected rows: [id, title, score, num_comments, url, created_utc]
print(submissions_data[:2])

[['ri7eum', 'Official Discussion - Spider-Man: No Way Home [SPOILERS]', 13119, 21125, 'https://www.reddit.com/r/movies/comments/ri7eum/official_discussion_spiderman_no_way_home_spoilers/', 1639710193], ['rmf1h8', 'Official Discussion - The Matrix Resurrections [SPOILERS]', 3317, 12753, 'https://www.reddit.com/r/movies/comments/rmf1h8/official_discussion_the_matrix_resurrections/', 1640207857]]


In [22]:
# Keep the first occurrence of every collected submission whose ID is neither already
# recorded in `submission_ids` nor already kept in this pass.
new_submissions = []   # rows that are new
set_submissions = set()  # IDs of the rows kept in new_submissions

for row in submissions_data:
    row_id = row[0]  # the submission ID is the first field of each row

    # Skip rows already kept in this pass, and rows whose ID is already known
    if row_id in set_submissions or row_id in submission_ids:
        continue

    set_submissions.add(row_id)
    new_submissions.append(row)

print(len(new_submissions))

130


In [25]:
# Show five rows of the submissions table as a sanity check.
# NOTE(review): the third argument is a string of code ("cur.fetchall()") - presumably the helper
# evaluates it against its cursor to fetch the result; confirm in utils.interact_with_db.
print(utils.interact_with_db(conn, "SELECT * FROM submissions LIMIT 5", "cur.fetchall()"))

[('sk2fq3', "I don't see an official discussion yet. Moonfall - wtf?", 1, 0, 'https://www.reddit.com/r/movies/comments/sk2fq3/i_dont_see_an_official_discussion_yet_moonfall_wtf/', 1643943702), ('sk2cae', "I don't see an official discussion yet. Moonfall - what the fuck?", 1, 0, 'https://www.reddit.com/r/movies/comments/sk2cae/i_dont_see_an_official_discussion_yet_moonfall/', 1643943448), ('s2n3uw', 'No official movie discussion thread for The 355?', 1, 0, 'https://www.reddit.com/r/movies/comments/s2n3uw/no_official_movie_discussion_thread_for_the_355/', 1642038292), ('rscxqu', '[Official Discussion] Spider-Man: No Way Home streaming-Reddit', 1, 1, 'https://www.reddit.com/r/HDSpiderManNoWayHome/', 1640900270), ('rlvvkr', "Official Discussion Megathread +Holiday week Discussion schedule (Nightmare Alley / Spider-Man: No Way Home / Red Rocket / The King's Man / Being the Ricardos / Swan Song)", 1, 0, 'https://www.reddit.com/r/movies/comments/rlvvkr/official_discussion_megathread_holiday_w

In [24]:
# Each row holds six fields: id, title, score, num_comments, url, created_utc
n_submission_attributes = 6

# Write the collected rows to the 'submissions' table inside a transaction.
# NOTE(review): this passes the full `submissions_data` list (which may contain duplicates and
# IDs already stored), not the filtered `new_submissions` computed in an earlier cell - confirm
# that utils.add_rows ignores or replaces existing rows, otherwise pass `new_submissions`.
with conn:
    n_submissions = utils.add_rows(conn, 'submissions', n_submission_attributes, submissions_data)

# Using r/discussionarchive

In [36]:
# Collect the IDs of the Reddit threads that posts in r/discussionarchive link to.
# A linked thread URL looks like ".../comments/<id>/<slug>/"; the <id> part is what gets stored.
archive_data = set()

for submission in reddit.subreddit('discussionarchive').new(limit=1000):
    id_match = re.search("comments/([^/]+)", submission.url)

    if id_match is None:
        # This post does not link to a comment thread. Skip just this post instead of
        # aborting the whole scan, so that later posts in the listing are still collected.
        continue

    archive_data.add(id_match.group(1))

# The listing simply ends when the subreddit has no more posts (or the `limit` is reached);
# no exception handling is needed for that.

In [37]:
# Number of distinct thread IDs extracted from r/discussionarchive
len(archive_data)

812

## Include those found using pushshift (see other notebook)

In [38]:
# Load the IDs stored one per line in the Pushshift export and de-duplicate them.
with open(discussionarchive_submissions_pushshift, 'r') as pushshift_file:
    pushshift_lines = pushshift_file.read().splitlines()

archive_pushshift = set(pushshift_lines)

print(len(archive_pushshift))

831


In [39]:
# Union the Pushshift IDs into the PRAW-collected IDs (in place; both are sets)
archive_data |= archive_pushshift

print(len(archive_data))

962


## See what is newly found

In [40]:
# Remove, in place, every ID that is already present in `submission_ids`
# (the IDs loaded from the database at the start of the notebook).
# Note: this mutates `archive_data`, so the cell is not idempotent with respect to earlier counts.
archive_data.difference_update(submission_ids)

# Number of archive IDs that are not yet in the database
print(len(archive_data))

214


In [41]:
# Also drop, in place, the IDs that the r/movies search already yielded as new
# (`set_submissions` is the set of IDs kept during de-duplication).
archive_data -= set_submissions

print(len(archive_data))

146


In [3]:
# Manual override: replaces the set of IDs computed in the cells above with a single hard-coded ID,
# so the collection cell below only fetches this one submission.
# NOTE(review): looks like a one-off re-run for a single missing thread - remove or comment out this
# cell to process the full computed `archive_data` on a fresh Restart & Run All.
archive_data = ['11lgm8']

## Collect their submission data

In [4]:
# Fetch the submission data for every ID in `archive_data` and store one row per submission:
# [id, title, score, num_comments, url, created_utc as whole seconds].
# Note: this re-uses (and resets) the name `submissions_data` from the search section above.
submissions_data = []

for sub_id in tqdm(archive_data):
    try:
        submission = reddit.submission(sub_id)

        submission_list = [
            submission.id,
            submission.title,
            submission.score,
            submission.num_comments,
            submission.url,
            int(submission.created_utc)
        ]

        submissions_data.append(submission_list)
    except Exception as exc:
        # Best effort: report the failing ID together with the actual error and carry on with
        # the next one. `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt and
        # SystemExit working, so the loop can still be interrupted from the notebook.
        print(f"Something went wrong with {sub_id}: {exc!r}")

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]


In [5]:
# Show the rows fetched in the previous cell
print(submissions_data)

[['11lgm8', '31 Days of Halloween - 10/16 - Official Discussion Thread - "Psycho"\n\n', 3, 4, 'https://www.reddit.com/r/movies/comments/11lgm8/31_days_of_halloween_1016_official_discussion/', 1350427259]]


In [52]:
# Number of submissions that were fetched successfully
# (the execution count shows this output comes from an earlier run than the cells around it)
len(submissions_data)

144

In [6]:
# Each row holds six fields: id, title, score, num_comments, url, created_utc
n_submission_attributes = 6

# Write the rows fetched by ID to the 'submissions' table inside a transaction
# (`with conn:` commits on success and rolls back on error).
with conn:
    n_submissions = utils.add_rows(conn, 'submissions', n_submission_attributes, submissions_data)

In [7]:
# Total number of rows in the submissions table after the insert
utils.interact_with_db(conn, "SELECT COUNT(*) FROM submissions", "cur.fetchone()")

(2048,)

In [10]:
# Clean-up: delete rows whose submission_id is NULL, empty, or consists only of spaces
sql = "DELETE FROM submissions WHERE submission_id IS NULL OR trim(submission_id) = '';"

# NOTE(review): commit=True presumably makes the helper commit the statement itself; the
# surrounding `with conn:` also commits on success - confirm the helper's behavior in utils.
with conn:
    utils.interact_with_db(conn, sql, commit=True)

In [11]:
# Re-count the rows after the clean-up; an unchanged count means no rows had a blank submission_id
utils.interact_with_db(conn, "SELECT COUNT(*) FROM submissions", "cur.fetchone()")

(2048,)