In [None]:
from pprint import pprint
import requests
import json
import pandas as pd
import time
from tqdm import tqdm
from datetime import datetime
import os
import sys

from utils import get_pushshift_data
from params import comments_file, politics_comments_file

In [None]:
# Search parameters sent along with every Pushshift request.
# Keys are Pushshift parameter names (see https://pushshift.io/api-parameters/ for possible parameters).
param_dict = dict(
    metadata='true',
    subreddit='politics',
    size=0,
)

In [None]:
# Distinct values of the "author" column in the ";"-separated comments file.
authors = {*pd.read_csv(comments_file, sep=";")["author"].to_list()}
# Size of the full author set, kept for the consistency check further down.
original_authors_length = len(authors)

In [None]:
# Authors already present in the results file from earlier runs.
# The results file is only created by the collection loop, so on a first run it
# does not exist yet; treat that case as "nothing collected so far" instead of
# failing with FileNotFoundError.
if os.path.exists(politics_comments_file):
    collected_authors = set(pd.read_csv(politics_comments_file)["author"].tolist())
else:
    collected_authors = set()

# Keep only the authors that still have to be queried.
authors.difference_update(collected_authors)

print(original_authors_length) # analysis comments file: 291415
print(len(authors))
print(len(collected_authors))

In [None]:
# Sanity check: pending plus already-collected authors must add up to the original total.
assert original_authors_length == len(authors) + len(collected_authors)

In [None]:
# For every remaining author, query Pushshift with `param_dict` and store the
# `total_results` value from the response metadata. Results are buffered in
# memory and appended to `politics_comments_file` in batches.
n_comments_out = []    # collected counts, aligned index-by-index with authors_out
authors_out = []       # authors collected since the last write to disk
author_errors = set()  # authors for which no result could be stored
end_time = time.time() + 60*60*40  # time budget: stop 40 hours after this cell starts

COMMENT_SEARCH_URL = "https://api.pushshift.io/reddit/search/comment/?"
FLUSH_EVERY = 2000       # write the buffers to disk once this many results accumulate
RETRY_WAIT_SECONDS = 60  # pause between retries while the same error keeps occurring


def _flush_results():
    """Append the buffered author/count pairs to the results CSV and empty the buffers.

    Does nothing when the buffers are empty. The header row is only written when
    the results file does not exist yet.
    """
    if not authors_out:
        return

    df = pd.DataFrame({
        'author' : authors_out,
        'politics_comments' : n_comments_out
    })
    df.to_csv(politics_comments_file, mode='a', header=not os.path.exists(politics_comments_file), index=False)

    # Clear in place so the module-level buffers used by the loop are reset.
    authors_out.clear()
    n_comments_out.clear()


try:
    for author in tqdm(authors):
        param_dict['author'] = author

        data = None
        time_to_stop = False
        time.sleep(0.5) # current rate limit is 120/min according to https://api.pushshift.io/meta

        try:
            try:
                data = get_pushshift_data(param_dict, url=COMMENT_SEARCH_URL, data_only=False)
            except AssertionError as first_error:
                # Keep retrying for as long as the request fails with the same
                # assertion message; a different message ends the retries.
                first_message = first_error.args[0] if first_error.args else None
                same_error = True

                while same_error:
                    if time.time() > end_time:
                        time_to_stop = True
                        # The buffer may be empty (e.g. right after a flush), so
                        # fall back to the current author instead of indexing [-1].
                        last_user = authors_out[-1] if authors_out else author
                        print(f"Finished at {datetime.now()} with user {last_user} while the server was down")
                        break

                    time.sleep(RETRY_WAIT_SECONDS)
                    try:
                        data = get_pushshift_data(param_dict, url=COMMENT_SEARCH_URL, data_only=False)
                        same_error = False
                    except AssertionError as retry_error:
                        retry_message = retry_error.args[0] if retry_error.args else None
                        same_error = retry_message == first_message

            if time_to_stop:
                break

            if data is not None:
                # Read the count before touching either buffer, so a malformed
                # response cannot leave the two lists with different lengths.
                total_results = data['metadata']['total_results']
                authors_out.append(author)
                n_comments_out.append(total_results)
            else:
                # No data without an exception (retries ended on a different
                # error): record the author instead of dropping it silently.
                author_errors.add(author)

            # ">=" rather than "==": if an earlier flush failed, the buffer may
            # already be past the threshold and must still be written.
            if len(authors_out) >= FLUSH_EVERY:
                _flush_results()

        except Exception:
            # Deliberately not a bare except: KeyboardInterrupt must still be
            # able to stop this long-running loop.
            print(f"Something went wrong at {datetime.now()} with user {author}")
            author_errors.add(author)

        # Check the time budget after every author; the final flush below
        # takes care of whatever is still buffered.
        if time.time() > end_time:
            print(f"Finished at {datetime.now()} with user {author}")
            break
finally:
    # Runs on normal completion, on break, and on interruption, so buffered
    # results are not lost.
    _flush_results()

In [None]:
# Show the authors recorded in `author_errors` during the collection loop.
print(author_errors)

In [None]:
# Load the results CSV that the collection loop appends to.
pols = pd.read_csv(politics_comments_file)

In [None]:
# Preview the last rows of the collected results.
pols.tail()