In [1]:
#import utils

import json
import os
import sys
import time
from datetime import datetime
from pprint import pprint
from urllib.parse import urlencode

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Comment-level dataset: read for its `author` column and, at the end of the
# notebook, written back with the per-author counts merged in.
analysis_dataset = 'data/comments_analysis.csv'
# Per-author r/politics comment counts, appended to by the collection loop.
politics_comments_file = 'data/politics_comments.csv'

# Base query sent to pushshift for every author
# (see https://pushshift.io/api-parameters/ for possible parameters).
# The collection loop adds an 'author' key and reads metadata.total_results
# from the response, so no result rows are requested (size=0).
param_dict = dict(
    metadata='true',
    subreddit='politics',
    size=0,
)

In [3]:
def get_pushshift_data(param_dict, url='https://api.pushshift.io/reddit/search/submission/?', timeout=60):
    """
    Return data from the pushshift API
    Based on: https://github.com/SeyiAgboola/Reddit-Data-Mining/blob/master/Using_Pushshift_Module_to_extract_Submissions.ipynb
    :param param_dict: A dictionary with key+value pairs to feed to the API
    :param url: The URL of the pushshift API endpoint; it must already end in
        '?' (or '&') because the encoded query string is appended directly
    :param timeout: Seconds to wait for the server before requests gives up,
        so a stalled connection cannot block the caller forever
    :return: The decoded JSON response
    :raises AssertionError: if the HTTP status is not 200; the status code is
        the exception's first argument (callers inspect ``args[0]``)
    """
    # urlencode escapes keys and values, so characters such as '&', '=' or
    # spaces inside a value can no longer corrupt the query string.
    query_url = f'{url}{urlencode(param_dict)}'

    r = requests.get(query_url, timeout=timeout)

    # Explicit raise instead of an `assert` statement: existing callers still
    # get AssertionError, but the check is no longer stripped by `python -O`.
    if r.status_code != 200:
        raise AssertionError(r.status_code)

    return r.json()

In [None]:
# Every distinct author in the analysis dataset. The size of the full set is
# remembered so the bookkeeping can be checked once already-collected authors
# have been removed from `authors`.
author_column = pd.read_csv(analysis_dataset)["author"]
authors = set(author_column.to_list())
original_authors_length = len(authors)

In [24]:
# Authors whose counts are already in the output CSV (from an earlier, possibly
# interrupted, run) are removed from the work list so they are not queried again.
# On the very first run the CSV does not exist yet - the collection loop creates
# it - so treat a missing file as "nothing collected" instead of crashing.
if os.path.exists(politics_comments_file):
    collected_authors = set(pd.read_csv(politics_comments_file)["author"].tolist())
else:
    collected_authors = set()
authors.difference_update(collected_authors)

# total authors / still to collect / already collected
print(original_authors_length)
print(len(authors))
print(len(collected_authors))

291415
5
291410


In [25]:
# Sanity check: remaining + already-collected authors must add up to the
# original total. This only holds when every collected author also appears in
# the analysis dataset.
assert len(authors)+len(collected_authors) == original_authors_length

In [26]:
COMMENT_SEARCH_URL = "https://api.pushshift.io/reddit/search/comment/?"
BATCH_SIZE = 2000             # rows buffered in memory before being appended to the CSV
RETRY_WAIT_SECONDS = 60       # pause between retries while the API keeps returning errors
REQUEST_PAUSE_SECONDS = 0.5   # current rate limit is 120/min according to https://api.pushshift.io/meta
MAX_RUNTIME_SECONDS = 60*60*40  # stop collecting after 40 hours


def _append_batch(batch_authors, batch_counts):
    """Append buffered (author, politics_comments) rows to `politics_comments_file`.

    The header row is written only when the file does not exist yet.
    An empty batch is a no-op.
    """
    if not batch_authors:
        return
    df = pd.DataFrame({
        'author' : batch_authors,
        'politics_comments' : batch_counts
    })
    df.to_csv(politics_comments_file, mode='a', header=not os.path.exists(politics_comments_file), index=False)


def _fetch_until_deadline(params, deadline):
    """Query the comment endpoint, retrying while it answers with a non-200 status.

    get_pushshift_data signals a non-200 status with AssertionError. Every such
    failure is retried after RETRY_WAIT_SECONDS, whatever the status code, so an
    author is never dropped just because the error code changed between attempts.

    :param params: query parameters for get_pushshift_data
    :param deadline: time.time() value after which no further retry is made
    :return: the JSON payload, or None if the deadline passed while the API was
        still failing
    """
    while True:
        try:
            return get_pushshift_data(params, url=COMMENT_SEARCH_URL)
        except AssertionError:
            if time.time() > deadline:
                return None
            time.sleep(RETRY_WAIT_SECONDS)


n_comments_out = []
authors_out = []
author_errors = set()
end_time = time.time() + MAX_RUNTIME_SECONDS

for author in tqdm(authors):
    time.sleep(REQUEST_PAUSE_SECONDS)

    try:
        # Query with a copy so the shared param_dict is not mutated.
        data = _fetch_until_deadline({**param_dict, 'author': author}, end_time)

        if data is None:
            # Report the author being processed: the buffer may be empty here,
            # so indexing authors_out[-1] would raise IndexError.
            print(f"Finished at {datetime.now()} with user {author} while the server was down")
            break

        # Read the count before touching the buffers, so a malformed response
        # cannot leave the two lists with different lengths.
        n_results = data['metadata']['total_results']
        authors_out.append(author)
        n_comments_out.append(n_results)

        # `>=` rather than `==`: if a write fails, the batch is retried on the
        # next successful author instead of never matching the size again.
        if len(n_comments_out) >= BATCH_SIZE:
            _append_batch(authors_out, n_comments_out)
            n_comments_out = []
            authors_out = []

            # The time budget is only enforced on batch boundaries, right
            # after the buffer has been written out.
            if time.time() > end_time:
                print(f"Finished at {datetime.now()} with user {author}")
                break

    except Exception:
        # Best effort per author: record the failure and move on.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        print(f"Something went wrong at {datetime.now()} with user {author}")
        author_errors.add(author)

# Write whatever is still buffered (last partial batch, or an early stop).
_append_batch(authors_out, n_comments_out)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.07s/it]


In [20]:
# Authors whose request or processing raised an unexpected exception in the
# collection loop; an empty set means no author hit that path.
print(author_errors)

set()


In [3]:
# Load the per-author r/politics comment counts written by the collection loop.
pols = pd.read_csv(politics_comments_file)

In [4]:
# Show the last rows of the counts file.
pols.tail()

Unnamed: 0,author,politics_comments
291410,niorec,6
291411,Doom_Marine_II,0
291412,Muouy,21
291413,wullymammith,412
291414,speakharp,0


In [19]:
# Load the full comment-level analysis dataset (the same file `authors` was read from).
comments = pd.read_csv(analysis_dataset)

In [20]:
# Switch both frames to pandas' nullable (pd.NA-aware) dtypes.
# NOTE(review): presumably so the `author` merge key has the same dtype in both
# frames and the counts stay integers after the left merge - confirm.
pols = pols.convert_dtypes()
comments = comments.convert_dtypes()

In [21]:
# Attach each author's r/politics comment count to every comment row.
# - The counts file is appended to across runs, so keep one row per author;
#   otherwise the left join would duplicate comment rows.
# - Drop any `politics_comments` column already present in `comments`: the
#   merged frame is written back over the analysis dataset, so re-running the
#   notebook would otherwise produce `politics_comments_x` / `_y` columns.
# - validate="many_to_one" makes pandas raise if the right side is not unique
#   on `author` after all.
politics_counts = (
    pols[["author", "politics_comments"]]
    .drop_duplicates(subset="author", keep="last")
)
comments = (
    comments
    .drop(columns="politics_comments", errors="ignore")
    .merge(politics_counts, how="left", on="author", validate="many_to_one")
)

In [22]:
# Write the merged frame back to the analysis dataset path, without the index.
# NOTE(review): this overwrites the file the notebook reads as input, so a later
# run will load a dataset that already contains the merged column.
comments.to_csv(analysis_dataset, index=False)