# Collecting Reddit comments

## Libraries

In [1]:
import praw
from tqdm import tqdm
from datetime import datetime
import sqlite3

import utils

## Setup

In [19]:
# Search parameters: the comment fields to collect, written as Python expressions in text form.
# The tuple is passed to utils.get_comments_data in the collection loop.
# (presumably each expression is evaluated per comment inside utils — confirm there)
comment_attributes = (
    'comment.id',
    'submission.id',
    'comment.body',
    # The author has to be converted to a string, otherwise it is a class instance.
    'str(comment.author)',
    'comment.score',
    'comment.created_utc',
)

# Upper bound on the number of comments gathered by one run of the collection loop
comment_limit = 3_000_000

# Location and name of the SQLite database file, plus the connection used by every cell below
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

# Reddit instance in PRAW, configured through praw.ini (personal username, password etc.).
# The praw.ini file was removed from this folder before the project was handed in,
# which is why this cell now gives an error message.
# See: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
reddit_praw_id = "Jarik"
reddit = praw.Reddit(reddit_praw_id)

## Create SQL database

In [3]:
# Schema of the comments table: one row per comment, keyed by the comment ID.
# The six columns are in the same order as the six entries of comment_attributes (Setup cell).
# NOTE(review): SQLite only enforces the FOREIGN KEY clause when foreign-key support is
# enabled on the connection (PRAGMA foreign_keys); no visible cell does that — check utils.
sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                comment_id text PRIMARY KEY,
                                submission_id text,
                                body text,
                                author text,
                                score integer,
                                created integer,
                                FOREIGN KEY (submission_id) REFERENCES submissions (submission_id)
                            );"""

# Hand the statement to utils.interact_with_db inside a transaction on conn.
# IF NOT EXISTS keeps an already existing table untouched when the cell is re-run.
with conn:
    utils.interact_with_db(conn, sql_create_comments_table)

## Determine for which submissions I still need to collect comments

In [20]:
# List submission IDs
# The comments were not downloaded in one go: removing the submission IDs returned for the
# 'comments' table lets a new run continue where the previous one left off.

with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')
    sub_ids_comments = utils.get_submission_ids(conn, 'comments')

# In-place difference: keep only the IDs that were not returned for the 'comments' table
submission_ids.difference_update(sub_ids_comments)
print(f"There are {len(submission_ids)} submissions for which no comments have been gathered")

There are 466 submissions for which no comments have been gathered


In [48]:
#DEL
# Scratch cell: overwrites submission_ids with a slice of the submissions table.
# "LIMIT 300 OFFSET 700" skips 700 rows and returns the next 300 (rows 701-1000 in whatever
# order the query yields them) — it does NOT select the first 700 submissions.
# (presumably get_submission_ids pastes its second argument into the SQL text — confirm in utils)
# Open question: select the submissions less than 6 months old instead?

with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions LIMIT 300 OFFSET 700') # alternative filter that was considered: WHERE created > 1627839158

In [42]:
#DEL
# Scratch check: number of IDs currently held in submission_ids.
len(submission_ids)

700

In [46]:
#DEL
# Scratch check (repeat): number of IDs currently held in submission_ids.
len(submission_ids)

## Collect comments

In [45]:
# Collect the comments of every submission in submission_ids and store them in the
# 'comments' table. The loop stops once comment_limit comments were added during this run.
# IDs of submissions that raised an error are appended to raised_errors.txt.
print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_attributes)
submission_errors = set()

try:
    for submission_id in tqdm(submission_ids):

        # Stop as soon as this run has gathered enough comments.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        try:
            # Collect comments
            comments_data = utils.get_comments_data(reddit, submission_id, comment_attributes=comment_attributes)

            # Save the comments to the database, one transaction per submission
            # (the sqlite3 connection context manager commits on success and rolls back on error).
            with conn:
                n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

            # Update counter
            comment_count += n_comments

        except Exception:
            # Sometimes the internet connection cuts off for a few minutes, or some other error happens.
            # Remember the submission and carry on with the next one.
            # `Exception` (not a bare `except`) so that interrupting the kernel still stops the loop.
            submission_errors.add(submission_id)
finally:
    # Also runs when the loop is interrupted, so the failed IDs gathered so far are not lost.
    with open('raised_errors.txt', 'a') as f:
        f.writelines(f"{failed_id}\n" for failed_id in submission_errors)

print(f"Finished at {datetime.now()}")

  0%|                                                                                          | 0/512 [00:00<?, ?it/s]

Starting at 2022-02-13 20:16:51.523966


100%|█████████████████████████████████████████████████████████████████████████████| 512/512 [10:04:59<00:00, 70.90s/it]

Finished at 2022-02-14 06:21:51.252218





In [53]:
# Summary of the last collection run: number of comments added and the IDs that raised errors.
print(comment_count)
print(submission_errors)

108293
set()


## Find comments for submissions that raised errors or were in the first 700

In [3]:
# Load raised_errors_pushshift.txt as a set with one entry per line of the file.
with open('raised_errors_pushshift.txt','r') as f:
    errors = set(f.read().splitlines())

In [4]:
# Number of distinct lines loaded from raised_errors_pushshift.txt.
len(errors)

416

In [49]:
# Load first700.txt as a set with one entry per line of the file
# (presumably the IDs of the first 700 submissions — confirm against how the file was written).
with open("first700.txt", "r") as f:
    first = set(f.read().splitlines())

In [50]:
# Remove, in place, every ID listed in errors from submission_ids.
submission_ids.difference_update(errors)

In [51]:
# Remove, in place, every ID listed in first (loaded from first700.txt) from submission_ids.
submission_ids.difference_update(first)

In [None]:
# Now run the 'Collect comments' loop again (it is repeated in the next cell),
# this time for submissions 701-1000.

In [52]:
# Second collection run: same procedure as the 'Collect comments' cell, applied to the
# reduced submission_ids. The loop stops once comment_limit comments were added during this run.
# IDs of submissions that raised an error are appended to raised_errors.txt.
print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_attributes)
submission_errors = set()

try:
    for submission_id in tqdm(submission_ids):

        # Stop as soon as this run has gathered enough comments.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        try:
            # Collect comments
            comments_data = utils.get_comments_data(reddit, submission_id, comment_attributes=comment_attributes)

            # Save the comments to the database, one transaction per submission
            # (the sqlite3 connection context manager commits on success and rolls back on error).
            with conn:
                n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

            # Update counter
            comment_count += n_comments

        except Exception:
            # Sometimes the internet connection cuts off for a few minutes, or some other error happens.
            # Remember the submission and carry on with the next one.
            # `Exception` (not a bare `except`) so that interrupting the kernel still stops the loop.
            submission_errors.add(submission_id)
finally:
    # Also runs when the loop is interrupted, so the failed IDs gathered so far are not lost.
    with open('raised_errors.txt', 'a') as f:
        f.writelines(f"{failed_id}\n" for failed_id in submission_errors)

print(f"Finished at {datetime.now()}")

  0%|                                                                                          | 0/241 [00:00<?, ?it/s]

Starting at 2022-02-14 06:21:51.398166


100%|██████████████████████████████████████████████████████████████████████████████| 241/241 [4:19:06<00:00, 64.51s/it]

Finished at 2022-02-14 10:40:57.600850





In [7]:
# Fetch every row of the submissions table whose num_comments is 0.
# The third argument is a string spelling the cursor call whose result is wanted
# (presumably evaluated inside utils.interact_with_db — confirm in utils).
zero_comments = utils.interact_with_db(conn, "SELECT * FROM submissions where num_comments=0", "cur.fetchall()")

In [9]:
# Set of the first column of every zero-comment row; the row printed further down for
# '372yx7' shows that the first column of a submissions row is the submission ID.
zero_set = {row[0] for row in zero_comments}

In [10]:
# Remove, in place, the submissions recorded with num_comments = 0 from submission_ids
# (presumably because there is nothing to collect for them).
submission_ids.difference_update(zero_set)

In [11]:
# Number of IDs left in submission_ids.
len(submission_ids)

350

In [41]:
# Number of IDs in zero_set.
len(zero_set)

487

In [42]:
# Remove, in place, every ID that is still in submission_ids from zero_set.
# NOTE(review): the execution counts of the surrounding cells are not sequential, so the
# displayed outputs may come from a different kernel state than a top-to-bottom run gives.
zero_set.difference_update(submission_ids)

In [47]:
# Number of IDs left in submission_ids.
len(submission_ids)

4

In [48]:
# Display the remaining submission IDs.
submission_ids

{'2pa6za', '372yx7', '37411h', '3u21u6'}

In [53]:
# Look up the submissions row of '372yx7', one of the IDs still left in submission_ids.
print(utils.interact_with_db(conn, "SELECT * FROM submissions where submission_id='372yx7'", "cur.fetchall()"))

[('372yx7', "Official Discussion: Marvel's The Avengers 2: Age Of Ultron", 1, 1, 'http://www.reddit.com/r/movies/comments/372yx7/official_discussion_marvels_the_avengers_2_age_of/', 1432466525)]


In [58]:
# Check whether any comments are stored for submission '372yx7'.
print(utils.interact_with_db(conn, "SELECT * FROM comments where submission_id='372yx7'", "cur.fetchall()"))

[]


In [57]:
# Total number of rows in the comments table at the time this cell was run.
print(utils.interact_with_db(conn, "SELECT COUNT(*) FROM comments", "cur.fetchone()"))

(967608,)


In [13]:
# Total number of rows in the comments table; same query as the previous cell, run at a
# different moment (the execution counts differ), hence the different number.
print(utils.interact_with_db(conn, "SELECT COUNT(*) FROM comments", "cur.fetchone()"))

(1568971,)
