# Collecting Reddit comments

## Libraries

In [None]:
from tqdm import tqdm
from datetime import datetime
import sqlite3
import time

import utils
from params import sql_db

## Setup

In [None]:
# Query parameters passed to every Pushshift request. More keys ('link_id',
# 'before') are added to this dict while collecting.
# NOTE(review): 'size' is presumably the number of results requested per call — confirm.
param_dict = dict(size=1000)

# Comment fields to extract, listed in the same order as the columns of the
# `comments` table. The last entry ("created_utc") is also used as the
# pagination cursor during collection.
comment_keys = (
    'id',
    'link_id',
    'body',
    'author',
    'score',
    "created_utc",
)

# Number of comments to obtain; no new submission is started once this many
# comments have been stored.
comment_limit = 250_000

# Open the connection to the SQLite database; its path (`sql_db`) is imported
# from the params module.
conn = sqlite3.connect(database=sql_db)

## Create SQL database

In [None]:
# DDL for the `comments` table: one row per comment, keyed by comment_id and
# linked to its submission through submission_id. IF NOT EXISTS makes it safe
# to re-run this cell.
# NOTE(review): SQLite only enforces FOREIGN KEY constraints when
# `PRAGMA foreign_keys = ON` is set on the connection; nothing visible here
# enables it — confirm whether enforcement is intended.
sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                comment_id text PRIMARY KEY,
                                submission_id text,
                                body text,
                                author text,
                                score integer,
                                created integer,
                                FOREIGN KEY (submission_id) REFERENCES submissions (submission_id)
                            );"""

# Using the sqlite3 connection as a context manager commits on success and
# rolls back if the statement raises (it does not close the connection).
with conn:
    utils.interact_with_db(conn, sql_create_comments_table)

## Determine for which submissions I still need to collect comments

In [None]:
# Work out which submissions still need their comments downloaded.
# The comments are not downloaded in one go, so removing the submission IDs
# that already appear in the `comments` table lets a later run continue where
# the previous one left off.

with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')
    sub_ids_comments = utils.get_submission_ids(conn, 'comments')

# Drop, in place, every submission that already has stored comments.
# NOTE(review): a submission for which no comment was ever stored stays in the
# set and will be queried again on every run.
submission_ids.difference_update(sub_ids_comments)
print(f"There are {len(submission_ids)} submissions for which no comments have been gathered")

In [None]:
# NOTE(review): this replaces the set computed in the previous cell with a
# single hard-coded submission, so the collection loop below only processes
# '7jwxnd'. It looks like a debugging/test override — skip this cell when a
# full collection run is intended.
submission_ids = {'7jwxnd'}

## Collect comments

In [None]:
# Collect the comments of every pending submission, page by page, and store
# them in the `comments` table until `comment_limit` comments have been saved.

COMMENT_SEARCH_URL = 'https://api.pushshift.io/reddit/search/comment/?'
RETRY_DELAY_SECONDS = 60


def _fetch_comment_page(params):
    """Request one page of comments, retrying once after a pause.

    If the first request raises, wait RETRY_DELAY_SECONDS and try again.
    A second failure propagates to the caller.
    """
    try:
        return utils.get_pushshift_data(params, COMMENT_SEARCH_URL)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C / kernel
        # interrupt still stops the run instead of triggering a retry.
        time.sleep(RETRY_DELAY_SECONDS)
        return utils.get_pushshift_data(params, COMMENT_SEARCH_URL)


print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_keys)

with tqdm(total=comment_limit) as pbar:
    for submission_id in submission_ids:

        # The limit is only checked between submissions, so the final count
        # can overshoot `comment_limit` by the size of the last submission.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        try:
            param_dict['link_id'] = submission_id
            # Clear the pagination cursor left over from the previous
            # submission; otherwise this submission would be queried with a
            # stale 'before' timestamp and its newer comments would be missed.
            param_dict.pop('before', None)

            data = _fetch_comment_page(param_dict)

            while len(data) > 0:

                # Extract the wanted fields; comments that cannot be parsed
                # are skipped on purpose (best effort).
                comments_data = []
                for comment in data:
                    try:
                        comments_data.append(utils.collect_submission_data(comment, keys=comment_keys))
                    except Exception:
                        continue

                if not comments_data:
                    # No usable comment on this page means there is no
                    # timestamp to advance the cursor with; stop paging this
                    # submission rather than failing on comments_data[-1].
                    print(f"No usable comments in page at {datetime.now()} for submission {submission_id}")
                    break

                # Store the bare submission ID in the link_id position so it
                # matches the IDs used for the submissions.
                for comment in comments_data:
                    comment[1] = submission_id

                # Save the comments to the database (committed when the
                # `with` block exits without an error).
                with conn:
                    n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

                # Update counter and tqdm
                comment_count += n_comments
                pbar.update(n_comments)

                # Next page: only comments older than the last one stored.
                # The last field of each row is "created_utc" (see comment_keys).
                # NOTE(review): assumes the API returns newest comments first — confirm.
                param_dict['before'] = comments_data[-1][-1]

                data = _fetch_comment_page(param_dict)

        except Exception as exc:
            # Sometimes the internet connection drops for a few minutes, or
            # some other error happens. Report it and move on to the next
            # submission so one failure does not abort the whole run.
            print(f"Something went wrong at {datetime.now()} with submission {submission_id}: {exc!r}")
            continue

print(f"Finished at {datetime.now()}")

In [None]:
# Sanity check: count the rows stored in `comments` for submission '7jwxnd'.
# NOTE(review): the third argument is passed as a string; presumably the
# helper uses it to decide how to fetch the result — confirm in utils.
count_query = "SELECT COUNT(*) FROM comments WHERE submission_id = '7jwxnd'"
utils.interact_with_db(conn, count_query, "cur.fetchone()")