# Collecting Reddit comments

## Libraries

In [1]:
from tqdm import tqdm
from datetime import datetime
import sqlite3
import time

import utils

## Setup

In [2]:
# Base query parameters that accompany every API request made below
param_dict = dict(size=1000)

# Attributes extracted from each comment. The order appears to line up with
# the columns of the comments table (id -> comment_id, link_id -> submission_id, ...)
comment_keys = (
    'id',
    'link_id',
    'body',
    'author',
    'score',
    "created_utc",
)

# Target number of comments to gather in one run of the collection cell
comment_limit = 10000

# SQLite database file (relative to the notebook) and the connection shared by all cells
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

## Create SQL database

In [3]:
# DDL for the table that stores one row per collected comment.
# IF NOT EXISTS keeps this cell safe to re-run against an existing database.
# NOTE(review): SQLite only enforces FOREIGN KEY clauses when
# `PRAGMA foreign_keys = ON` has been issued on the connection; nothing in this
# notebook shows that being set, so the reference to submissions is likely
# declarative only -- confirm whether enforcement is intended.
sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                comment_id text PRIMARY KEY,
                                submission_id text,
                                body text,
                                author text,
                                score integer,
                                created integer,
                                FOREIGN KEY (submission_id) REFERENCES submissions (submission_id)
                            );"""

# Using the connection as a context manager commits on success and rolls back
# if the statement raises.
with conn:
    utils.interact_with_db(conn, sql_create_comments_table)

## Determine for which submissions I still need to collect comments

In [4]:
# Resume support: the comments were not downloaded in a single session, so the
# IDs already present on the comments side are removed from the work list and
# collection picks up where it stopped.
# NOTE(review): a submission with at least one stored comment counts as done,
# so one that failed halfway through will not be retried from here.
with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')
    sub_ids_comments = utils.get_submission_ids(conn, 'comments')

# In-place set difference: what remains still needs its comments collected
submission_ids.difference_update(sub_ids_comments)

n_pending = len(submission_ids)
print(f"There are {n_pending} submissions for which no comments have been gathered")

There are 1003 submissions for which no comments have been gathered


## Collect comments

In [6]:
# Collect the comments of every pending submission.
#
# For each submission the API is queried repeatedly, moving the `before`
# timestamp back to the last comment of the previous page, until an empty page
# is returned. Every page is written to the `comments` table straight away, so
# an interrupted run keeps what it has gathered so far.
#
# Submissions that raise an error are skipped and their IDs appended to
# raised_errors.txt. That file matters: a submission that failed halfway
# already has rows in the comments table, so the resume logic above will not
# offer it again.

# Endpoint queried for both the first page and every follow-up page
COMMENT_ENDPOINT = 'https://api.pushshift.io/reddit/search/comment/?'

print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_keys)
# Position of link_id in each collected record, derived from comment_keys so
# the clean-up below keeps working if the key order is ever changed
link_id_pos = comment_keys.index('link_id')
submission_errors = set()

with tqdm(total=comment_limit) as pbar:
    for submission_id in submission_ids:

        # The limit is deliberately checked only between submissions: stopping
        # in the middle of one would leave it partially stored, and a partially
        # stored submission is treated as finished on the next run.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        # Work on a per-submission copy so the shared param_dict is not left
        # carrying a stale link_id / before value after this cell has run
        query = dict(param_dict, link_id=submission_id, before=int(time.time()))

        try:
            data = utils.get_pushshift_data(query, COMMENT_ENDPOINT)

            while len(data) > 0:

                # Reduce every raw comment to the attributes listed in comment_keys
                comments_data = [utils.collect_submission_data(comment, keys=comment_keys) for comment in data]

                # Drop the first three characters of link_id -- presumably the
                # 't3_' type prefix -- so the value matches the bare submission ID
                for comment in comments_data:
                    comment[link_id_pos] = comment[link_id_pos][3:]

                # Save the page to the database (committed when the block exits)
                with conn:
                    n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

                # Update counter and progress bar
                comment_count += n_comments
                pbar.update(n_comments)

                # Next page: everything older than the last comment just received
                # NOTE(review): relies on the API returning newest-first -- confirm
                query['before'] = data[-1]['created_utc']
                data = utils.get_pushshift_data(query, COMMENT_ENDPOINT)

        except Exception as exc:
            # Sometimes the internet cuts off for a few minutes, or some other
            # error happens; record the submission and move on to the next one.
            # Only Exception is caught so that Ctrl-C / kernel interrupt still
            # stops a long run instead of being logged as a failed submission.
            print(f"Something went wrong at {datetime.now()} with submission {submission_id}")
            print(f"    {type(exc).__name__}: {exc}")
            submission_errors.add(submission_id)

print(f"Finished at {datetime.now()}")

# Append the failed submission IDs (one per line) for later inspection or retry
with open('raised_errors.txt','a') as f:
    for failed_id in submission_errors:
        f.write(failed_id)
        f.write('\n')

  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Starting at 2022-02-10 22:00:14.891162


  0%|                                                                                        | 0/10000 [00:04<?, ?it/s]

Finished at 2022-02-10 22:00:19.616717





In [6]:
# Sanity check: show the first ten rows stored in the comments table.
# NOTE(review): the third argument is a code string; presumably the helper
# evaluates it against its cursor to return the rows -- confirm in utils.
utils.interact_with_db(
    conn,
    "SELECT * FROM comments LIMIT 10",
    "cur.fetchall()",
)

[('e0xm987',
  '7llz2i',
  "I think she always had gills. She's mute, she was found near water, she's dreaming about being underwater, she likes eggs(high protein diet), she's instantly attracted to the asset, she kinda controls water. Yeah.",
  'DanteR12',
  1,
  1529422643),
 ('e0ueq8b',
  '7llz2i',
  'https://youtu.be/nnZf05RSOww',
  'KnellerGreg',
  1,
  1529279533),
 ('e0r6q9y',
  '7llz2i',
  "ADDITIONALLY-- thank you so much for caring enough to ask, in a genuinely curious and respectful way. I'm curious to hear your thoughts!",
  'e-lutris',
  1,
  1529119335),
 ('e0r6ohz',
  '7llz2i',
  'I would suggest you watch the scene again if you are so inclined.  The movie hinted in an *earlier* scene that his fingers were already beginning to rot, when he was in the car and sniffed his fingers that were turning black. Not bleeding (color=red), but black (rotting), and he winces at the smell of his hand. Smell=infected/rotting. \n\nYes, it was the wife who instigated (consent ✓), but the

In [26]:
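# WARNING: destructive reset. Uncomment and run only to delete every collected
# comment and start over; keep it commented out so "Run All" never drops the table.
# utils.interact_with_db(conn, "DROP TABLE comments")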

In [None]:
# Release the SQLite connection opened in the setup cell; cells that use
# `conn` cannot be re-run after this without re-running the setup cell first.
conn.close()