# Collecting Reddit comments

## Libraries

In [1]:
from tqdm import tqdm
from datetime import datetime
import sqlite3
import time

import utils

## Setup

In [2]:
# Query parameters sent with every Pushshift request
param_dict = {'size': 1000}

# Attributes pulled out of each comment returned by the API
comment_keys = (
    'id',
    'link_id',
    'body',
    'author',
    'score',
    'created_utc',
)

# Upper bound on the number of comments to collect in one run
comment_limit = 250_000

# Path of the SQLite database file and the connection used by all later cells
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

## Create SQL database

In [3]:
# Schema of the `comments` table: one row per comment keyed by comment_id, with
# submission_id referencing the parent row in `submissions`.
# IF NOT EXISTS makes this cell safe to re-run against an existing database.
# NOTE(review): SQLite only enforces FOREIGN KEY constraints when
# `PRAGMA foreign_keys = ON` has been issued on the connection — confirm whether
# utils.interact_with_db does so; otherwise the constraint is declarative only.
sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                comment_id text PRIMARY KEY,
                                submission_id text,
                                body text,
                                author text,
                                score integer,
                                created integer,
                                FOREIGN KEY (submission_id) REFERENCES submissions (submission_id)
                            );"""

# `with conn` commits on success and rolls back on an exception; it does not
# close the connection.
with conn:
    utils.interact_with_db(conn, sql_create_comments_table)

## Determine for which submissions I still need to collect comments

In [4]:
# Work out which submissions still need their comments collected.
# The comments are not downloaded in one go, so IDs obtained for the `comments`
# table are removed from the IDs obtained for the `submissions` table; a re-run
# then continues where the previous run left off.
# NOTE(review): presumably a submission whose download was interrupted part-way
# already has rows in `comments` and is therefore skipped here — confirm against
# utils.get_submission_ids.

with conn:
    submission_ids = utils.get_submission_ids(conn, 'submissions')
    sub_ids_comments = utils.get_submission_ids(conn, 'comments')

# Remove, in place, every ID that was also returned for the comments table
submission_ids.difference_update(sub_ids_comments)
print(f"There are {len(submission_ids)} submissions for which no comments have been gathered")

There are 1003 submissions for which no comments have been gathered


In [35]:
# NOTE(review): this rebinds submission_ids to a single hard-coded submission,
# discarding whatever set it held before, so any later loop over submission_ids
# only processes '7jwxnd'. Presumably a targeted re-run — skip this cell to
# process every pending submission.
submission_ids = {'7jwxnd'}

## Collect comments

In [47]:
# Pushshift endpoint queried for every page of comments
PUSHSHIFT_COMMENT_URL = 'https://api.pushshift.io/reddit/search/comment/?'
# Seconds to wait before the single retry of a failed request
RETRY_DELAY_SECONDS = 60


def _fetch_comment_page(params):
    """Request one page of comments, retrying once after a pause.

    If the first request raises, wait RETRY_DELAY_SECONDS and try again.
    An exception raised by the second attempt propagates to the caller.
    """
    try:
        return utils.get_pushshift_data(params, PUSHSHIFT_COMMENT_URL)
    except Exception:
        time.sleep(RETRY_DELAY_SECONDS)
        return utils.get_pushshift_data(params, PUSHSHIFT_COMMENT_URL)


print(f"Starting at {datetime.now()}")
comment_count = 0
n_comment_attributes = len(comment_keys)
submission_errors = set()   # submissions whose collection failed or was incomplete
data_errors = []            # raw comments that could not be converted into a row

with tqdm(total=comment_limit) as pbar:
    for submission_id in submission_ids:

        # The limit is only checked between submissions, so the final count can
        # overshoot comment_limit by the size of the last submission.
        if comment_count >= comment_limit:
            print(f"Reached comment limit at {datetime.now()} with submission {submission_id}")
            print(f"Collected {comment_count} comments")
            break

        try:
            param_dict['link_id'] = submission_id
            # Drop the pagination cursor left by the previous submission (or by
            # an earlier run of this cell); a stale 'before' value would
            # silently exclude every newer comment of this submission.
            param_dict.pop('before', None)

            data = _fetch_comment_page(param_dict)

            while len(data) > 0:

                # Convert each raw comment into a row; keep the ones that fail
                # in data_errors instead of discarding them silently.
                comments_data = []
                for comment in data:
                    try:
                        comments_data.append(utils.collect_submission_data(comment, keys=comment_keys))
                    except Exception:
                        data_errors.append(comment)

                if not comments_data:
                    # Without a parsed row there is no timestamp to page from.
                    # Flag the submission and stop paging rather than failing
                    # on comments_data[-1].
                    print(f"No usable comments in a page at {datetime.now()} for submission {submission_id}")
                    submission_errors.add(submission_id)
                    break

                # Position 1 holds the value for 'link_id' (second entry of
                # comment_keys); store the plain submission ID there so it
                # matches the IDs in the submissions table.
                # NOTE(review): assumes rows are mutable sequences in
                # comment_keys order — confirm in utils.collect_submission_data.
                for comment in comments_data:
                    comment[1] = submission_id

                # Save the comments to the database
                with conn:
                    n_comments = utils.add_rows(conn, 'comments', n_comment_attributes, comments_data)

                # Update counter and tqdm
                comment_count += n_comments
                pbar.update(n_comments)

                # Page backwards: the next request only asks for comments older
                # than the last one of this page (its last field is the value
                # for 'created_utc', the last entry of comment_keys).
                # NOTE(review): presumably the API returns newest-first — confirm.
                param_dict['before'] = comments_data[-1][-1]
                data = _fetch_comment_page(param_dict)

        except Exception:
            # Sometimes the internet connection drops for a few minutes, or some
            # other error happens. Record the submission and move on to the
            # next one. KeyboardInterrupt is deliberately not caught, so the
            # cell can still be stopped by hand.
            print(f"Something went wrong at {datetime.now()} with submission {submission_id}")
            submission_errors.add(submission_id)

print(f"Finished at {datetime.now()}")

# Append the failed submission IDs, one per line, so they can be retried later
with open('raised_errors_pushshift.txt','a') as f:
    f.writelines(f"{failed_id}\n" for failed_id in submission_errors)

  0%|                                                                                       | 0/250000 [00:00<?, ?it/s]

Starting at 2022-02-16 22:21:34.566404


  2%|█▋                                                                        | 5600/250000 [04:31<3:06:01, 21.90it/s]

zzz @ 2022-02-16 22:26:19.016870


  4%|██▋                                                                       | 8899/250000 [08:39<3:01:02, 22.20it/s]

zzz @ 2022-02-16 22:30:14.628548


  5%|███▍                                                                     | 11699/250000 [11:50<3:01:18, 21.91it/s]

zzz @ 2022-02-16 22:33:36.308654


  5%|███▌                                                                     | 12099/250000 [13:29<8:07:00,  8.14it/s]

zzz @ 2022-02-16 22:35:05.046037


  9%|██████▋                                                                  | 22896/250000 [25:35<3:32:09, 17.84it/s]

zzz @ 2022-02-16 22:47:21.460709


 10%|███████                                                                  | 23995/250000 [27:32<3:22:54, 18.56it/s]

zzz @ 2022-02-16 22:49:23.112501


 10%|███████▍                                                                 | 25595/250000 [30:07<3:42:47, 16.79it/s]

zzz @ 2022-02-16 22:51:53.063285


 12%|████████▊                                                                | 30194/250000 [35:27<3:15:45, 18.71it/s]

zzz @ 2022-02-16 22:57:14.713691


 12%|████████▉                                                                | 30594/250000 [37:10<7:31:14,  8.10it/s]

zzz @ 2022-02-16 22:58:50.237087


 14%|██████████▌                                                              | 36191/250000 [43:26<3:28:31, 17.09it/s]

zzz @ 2022-02-16 23:05:12.776329


 16%|███████████▌                                                             | 39788/250000 [47:08<2:00:00, 29.19it/s]

zzz @ 2022-02-16 23:08:54.839196


 18%|█████████████▍                                                           | 45988/250000 [53:35<3:39:39, 15.48it/s]

zzz @ 2022-02-16 23:15:21.524078


 24%|████████████████▋                                                      | 58878/250000 [1:04:03<2:42:59, 19.54it/s]

zzz @ 2022-02-16 23:25:50.637557


 24%|█████████████████▏                                                     | 60578/250000 [1:06:49<2:47:52, 18.81it/s]

zzz @ 2022-02-16 23:28:35.712021


 25%|█████████████████▍                                                     | 61278/250000 [1:08:37<4:02:24, 12.98it/s]

zzz @ 2022-02-16 23:30:24.159702


 26%|██████████████████▋                                                    | 65875/250000 [1:13:19<2:50:34, 17.99it/s]

zzz @ 2022-02-16 23:35:05.678161


 29%|████████████████████▎                                                  | 71667/250000 [1:18:39<2:32:36, 19.48it/s]

zzz @ 2022-02-16 23:40:25.151159


 30%|█████████████████████▍                                                 | 75665/250000 [1:22:59<2:43:08, 17.81it/s]

zzz @ 2022-02-16 23:44:45.408328


 32%|██████████████████████▍                                                | 78847/250000 [1:26:56<3:08:43, 15.12it/s]

Finished at 2022-02-16 23:48:30.979095





In [48]:
# Sanity check: count the comments stored for one specific submission
count_query = "SELECT COUNT(*) FROM comments WHERE submission_id = '7jwxnd'"
utils.interact_with_db(conn, count_query, "cur.fetchone()")

(95940,)

In [26]:
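# WARNING: destructive and intentionally left disabled. Uncommenting the line
# below deletes the comments table and every comment collected so far.
# utils.interact_with_db(conn, "DROP TABLE comments")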

In [None]:
# Close the SQLite connection; cells that use `conn` cannot be run after this
# until a new connection is opened.
conn.close()