# Collecting Reddit submissions

## Libraries

In [28]:
import praw
from tqdm import tqdm
from datetime import datetime
import sqlite3
import regex as re

import utils

In [63]:
# Count the rows in the comments table and show the fetched result row.
# NOTE(review): the last argument is a code string; presumably the helper
# evaluates it against its own cursor -- confirm in utils.
comment_count_row = utils.interact_with_db(
    conn,
    "SELECT COUNT(*) FROM comments",
    "cur.fetchone()",
)
print(comment_count_row)

(1612624,)


In [69]:
# Re-run the row count on the comments table to see how far collection has progressed.
count_query = "SELECT COUNT(*) FROM comments"
latest_count_row = utils.interact_with_db(conn, count_query, "cur.fetchone()")
print(latest_count_row)

(1614398,)


In [64]:
# Scratch arithmetic: the comment count printed above (1612624) minus 1568971.
# NOTE(review): 1568971 is presumably an earlier count -- its origin is not shown here.
1612624 - 1568971

43653

In [70]:
# Scratch arithmetic: first printed count (1612624) minus the second (1614398).
# The result is negative because the larger, later count is the one subtracted.
1612624 - 1614398

-1774

## Setup

In [21]:
# File locations used by the notebook. They are assigned before any connection
# is opened so that they exist even when the Reddit login further down fails.
sql_db = './data/film_discussions'
discussionarchive_submissions_pushshift = 'data/discussionarchive_submissions_pushshift.txt'

# Open the SQLite database (sqlite3 creates the file if it does not exist yet)
# and keep the connection object for the cells below.
conn = sqlite3.connect(sql_db)

# Create the PRAW Reddit instance. The argument names the praw.ini site section
# that holds the personal credentials (username, password, etc.).
# The praw.ini file was removed from this folder before the project was handed
# in, so this call gives an error message until a praw.ini with a matching
# section is supplied.
# See: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
reddit_praw_id = "Jarik"
reddit = praw.Reddit(reddit_praw_id)

Version 7.2.0 of praw is outdated. Version 7.5.0 was released Sunday November 14, 2021.


In [71]:
# Load the ids for the "submissions" table; later cells use submission_ids to
# leave out submissions that are already known.
# `with conn` wraps the call in a transaction (commit on success, rollback on error).
with conn:
    submission_ids = utils.get_submission_ids(conn, "submissions")

## Collect submissions

In [None]:
# Search r/movies for "Official Discussion" once per sort order and record five
# attributes of every hit. Each search is limited to 1000 results, and the same
# submission may be returned by more than one search; duplicates are kept here.
print(f"Starting at {datetime.now()}")
submissions_data = []

search_orders = ["relevance", "hot", "top", "new", "comments"]
for order in tqdm(search_orders):
    hits = reddit.subreddit('movies').search("Official Discussion", sort=order, limit=1000)
    submissions_data.extend(
        {
            "id": post.id,
            "title": post.title,
            "score": post.score,
            "num_comments": post.num_comments,
            "created": post.created_utc,
        }
        for post in hits
    )

In [None]:
# Show the first two collected records as a quick sanity check.
first_two_records = submissions_data[:2]
print(first_two_records)

In [None]:
# Keep the first record seen for each id, leaving out every id that is already
# in submission_ids. set_submissions ends up holding exactly the kept ids.
new_submissions = []
set_submissions = set()

for record in submissions_data:
    record_id = record['id']
    if record_id in set_submissions or record_id in submission_ids:
        continue  # duplicate within this run, or already known
    new_submissions.append(record)
    set_submissions.add(record_id)

print(len(new_submissions))

In [None]:
# Store the search results in the 'submissions' table.
# Only new_submissions is written: it is the de-duplicated list with already
# known ids removed, whereas submissions_data still holds every raw search hit
# (repeated across sort orders and including submissions already stored).
# Five attributes per record: id, title, score, num_comments, created.
n_submission_attributes = 5

# `with conn` commits on success and rolls back if add_rows raises.
with conn:
    n_submissions = utils.add_rows(conn, 'submissions', n_submission_attributes, new_submissions)

# Using r/discussionarchive

In [None]:
# Collect the submission ids that the newest (up to 1000) r/discussionarchive
# posts link to. The id is the path segment following "comments/" in the URL.
archive_data = set()
skipped_urls = []

for archive_post in reddit.subreddit('discussionarchive').new(limit=1000):
    id_match = re.search(r"comments/([^/]+)", archive_post.url)
    if id_match is None:
        # Not a link to a comment thread; skip it instead of aborting the
        # whole collection with an IndexError.
        skipped_urls.append(archive_post.url)
        continue
    archive_data.add(id_match.group(1))

print(len(archive_data))
if skipped_urls:
    print(f"Skipped {len(skipped_urls)} posts without a 'comments/<id>' URL")

## See what is newly found

In [None]:
# Remove every id that also occurs in submission_ids, then report what is left.
archive_data -= set(submission_ids)

remaining_after_stored = len(archive_data)
print(remaining_after_stored)

In [None]:
# Remove the ids that the r/movies search cells already selected as new.
archive_data -= set_submissions

remaining_after_search = len(archive_data)
print(remaining_after_search)

## Include those found using pushshift (see other notebook)

In [None]:
# Read the Pushshift export: every line of the text file becomes one set entry.
with open(discussionarchive_submissions_pushshift, mode='r') as id_file:
    pushshift_lines = id_file.read().splitlines()

archive_pushshift = set(pushshift_lines)
print(len(archive_pushshift))

In [None]:
# Add the Pushshift ids to the ids still left from r/discussionarchive.
# NOTE(review): this merge runs after the two filtering cells, so the Pushshift
# ids are not checked against submission_ids or set_submissions -- confirm that
# re-fetching already known submissions is intended.
archive_data |= archive_pushshift

merged_total = len(archive_data)
print(merged_total)

## Collect their submission data

In [None]:
# Look up every collected id through PRAW and record the same five attributes
# that the search cell records. submissions_data is rebuilt from scratch here.
submissions_data = []

for sub_id in archive_data:
    submission = reddit.submission(sub_id)
    submissions_data.append(dict(
        id=submission.id,
        title=submission.title,
        score=submission.score,
        num_comments=submission.num_comments,
        created=submission.created_utc,
    ))

In [None]:
# Show the first two fetched archive records as a quick sanity check.
archive_preview = submissions_data[:2]
print(archive_preview)

## Store submissions

In [None]:
# Pass the archive records to utils.add_rows for the "submissions" table.
# Five attributes per record: id, title, score, num_comments, created.
n_submission_attributes = 5

# `with conn` commits on success and rolls back if add_rows raises.
with conn:
    n_submissions = utils.add_rows(
        conn,
        "submissions",
        n_submission_attributes,
        submissions_data,
    )