# Collecting Reddit comments

## Libraries

In [1]:
import praw
from tqdm import tqdm
from datetime import datetime
import sqlite3

import utils

In [81]:
# Reload the local `utils` module so that changes made to utils.py after the first
# import are picked up by the running kernel without restarting it.
import importlib
importlib.reload(utils)

<module 'utils' from 'D:\\Python\\Thesis\\utils.py'>

## Setup

In [77]:
# Search parameters: the attribute expressions collected for every comment, written as strings.
# NOTE(review): presumably utils.get_comments_data evaluates these strings against
# `comment` / `submission` objects -- confirm in utils.py.
comment_attributes = (
    'comment.id',
    'submission.id',
    'comment.body',
    'str(comment.author)',  # author is a class instance otherwise, so it is converted to a string
    'comment.score',
    'comment.created_utc',
)

# Number of comments to be obtained
comment_limit = 1_200_000

# Location and name of the SQL database, plus the connection object the other cells use
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

# Reddit instance in PRAW, configured through praw.ini with my personal Reddit username, password etc.
# The praw.ini file was removed from this folder before handing in the project,
# which is why this cell now gives an error message.
# See: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
reddit_praw_id = "Jarik"
reddit = praw.Reddit(reddit_praw_id)

## Create SQL database

In [3]:
# DDL for the `comments` table: one row per comment, keyed by comment_id, with
# submission_id declared as a foreign key into the `submissions` table.
# IF NOT EXISTS means re-running this cell does not fail when the table is already there.
# NOTE(review): SQLite only enforces FOREIGN KEY constraints when `PRAGMA foreign_keys = ON`
# is set on the connection; nothing in this notebook sets it -- confirm whether utils does.
sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                comment_id text PRIMARY KEY,
                                submission_id text,
                                body text,
                                author text,
                                score integer,
                                created integer,
                                FOREIGN KEY (submission_id) REFERENCES submissions (submission_id)
                            );"""

# Using the connection as a context manager commits on success and rolls back on an exception.
with conn:
    utils.interact_with_db(conn, sql_create_comments_table)

## Determine for which submissions I still need to collect comments

In [62]:
# List submission IDs.
# The comments were not downloaded in one go, so the difference_update below removes the
# submissions that were already handled and lets the collection continue where it left off.

with conn:
    # Same helper, two tables. Presumably it returns the submission_id values found in
    # the named table -- confirm in utils.py.
    submission_ids = utils.get_submission_ids(conn, 'submissions')
    sub_ids_comments = utils.get_submission_ids(conn, 'comments')

# Remove, in place, every ID that also appears in sub_ids_comments
submission_ids.difference_update(sub_ids_comments)
print(f"There are {len(submission_ids)} submissions for which no comments have been gathered")

There are 481 submissions for which no comments have been gathered


## Collect comments

In [92]:
# Collect the comments of every submission in `submission_ids` and store them in the
# `comments` table, until `comment_limit` comments have been gathered.
# Submissions that raise an error are skipped; their IDs are collected in
# `submission_errors` and appended to raised_errors.txt once the loop has ended.
print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_attributes)
submission_errors = set()

with tqdm(total=comment_limit) as pbar:
    for submission_id in submission_ids:

        # The limit is only checked before a submission is started, so the final
        # count can exceed comment_limit by the size of the last submission.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        try:
            # Collect comments
            comments_data = utils.get_comments_data(reddit, submission_id, comment_attributes=comment_attributes)

            # Save the comments to the database; `with conn` commits on success
            # and rolls back if add_rows raises.
            with conn:
                n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

            # Update counter and tqdm
            comment_count += n_comments
            pbar.update(n_comments)

        except Exception:
            # Sometimes my internet cuts off for a few minutes, or some other error happens.
            # Recording the ID and moving on allows the loop to continue.
            # `except Exception` (instead of a bare `except:`) keeps KeyboardInterrupt and
            # SystemExit out of this handler, so this long-running cell can still be
            # interrupted instead of logging the interrupt as a failed submission.
            submission_errors.add(submission_id)

# Append (mode 'a') so that IDs recorded by earlier runs are kept; one ID per line.
with open('raised_errors.txt','a') as f:
    for i in submission_errors:
        f.write(i)
        f.write('\n')

print(comment_count)
print(submission_errors)
print(f"Finished at {datetime.now()}")

  0%|                                                                                      | 0/1200000 [00:00<?, ?it/s]

Starting at 2022-02-15 23:58:47.740175


  7%|████▊                                                              | 86615/1200000 [19:12:58<247:00:54,  1.25it/s]

86615
{'bgnl7y', '7jwxnd', '1v2iqp', 'nx3jdq'}
Finished at 2022-02-16 19:11:46.681704





## Submissions that raised errors

In [82]:
# Load every submission ID that was logged as failing; the file holds one ID per line.
with open('raised_errors.txt','r') as f:
    errors = set(f.read().splitlines())

# Manual inspection found that these threads had been removed or contained no comments,
# so they are taken out of the set.
removed = {'ft2qny', 'cu3hzi', 'gkfeb1', 'ncnw25', 'c7xnp5', 'qd6y5i', 'ri7gfm'}
errors -= removed

# Submissions that raised errors a second time.
# NOTE(review): this assignment replaces the set computed from the file above, so only
# these four IDs are used from here on; the stored cell output (12) does not match
# this hard-coded set.
errors = {'bgnl7y', '7jwxnd', '1v2iqp', 'nx3jdq'}

print(len(errors))



12


In [89]:
# Rebind submission_ids to the set of failed submissions. Presumably this is done so that
# re-running the "Collect comments" cell retries only those submissions -- confirm.
# This is the same set object as `errors`, not a copy.
submission_ids = errors

In [93]:
# Print the number of rows currently stored in the comments table.
# NOTE(review): the fetch step is passed as the string "cur.fetchone()"; presumably
# utils.interact_with_db evaluates it on its cursor -- confirm in utils.py.
print(utils.interact_with_db(conn, "SELECT COUNT(*) FROM comments", "cur.fetchone()"))

(2295439,)


In [94]:
# Print the row count of the comments table again. This repeats the query of the cell
# above; its stored output is higher, presumably because more comments were collected
# in between the two executions.
print(utils.interact_with_db(conn, "SELECT COUNT(*) FROM comments", "cur.fetchone()"))

(2409188,)
