In [1]:
from datetime import datetime as dt
import json
import os
import os.path as path
import pandas as pd
from pprint import pprint
from tqdm import tqdm

import praw
from psaw import PushshiftAPI

In [2]:
# directories
# All exports go under `output_dir`, split into one folder per record kind.
output_dir = "reddit-output"
submissions_dir = path.join(output_dir, "submissions")
comments_dir = path.join(output_dir, "comments")

# Create the folders up front: the export cells open files with
# open(outpath, "w"), which does not create missing parent directories,
# so on a fresh checkout they would fail with FileNotFoundError.
for directory in (submissions_dir, comments_dir):
    os.makedirs(directory, exist_ok=True)

# accepted datatypes (for easy export to jsonl)
# `set` stays listed so set-valued attributes are kept, but the json module
# cannot encode sets, so make_json_formattable converts them to lists.
ALLOWED_TYPES = set([
    type(None), bool, int, float, str, tuple, list, dict, set])

# custom functions
def make_json_formattable(dictionary):
    """Return a shallow copy of `dictionary` whose top-level values can be dumped to JSON.

    Rules applied to each value, in order:
    - a set becomes a list (JSON has no set type);
    - a value whose exact type is in ALLOWED_TYPES is kept unchanged;
    - any other value is replaced by str(value), unless str(value) equals
      repr(value), in which case the key is dropped (the object has no
      meaningful string form of its own).

    The input dictionary is not modified and key order is preserved.

    Only top-level values are inspected: objects nested inside a list, tuple
    or dict are passed through untouched and can still make json.dumps fail.
    """
    formatted = {}
    for k, v in dictionary.items():
        if type(v) is set:
            # json.dumps raises TypeError on sets; a list carries the same items
            formatted[k] = list(v)
        elif type(v) in ALLOWED_TYPES:
            formatted[k] = v
        else:
            # force unacceptable values to be strings, and skip values that
            # cannot be coerced into a meaningful string
            text = str(v)
            if text != repr(v):
                formatted[k] = text

    return formatted

In [8]:
# setup antiwork subreddit
# Credentials are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError here rather than failing later
# with an opaque authentication error.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="sub scraper")
antiwork = reddit.subreddit("antiwork")

# check remaining limit
print(reddit.auth.limits)

{'remaining': None, 'reset_timestamp': None, 'used': None}


## PRAW scraper

Pros: 
- full information (including upvotes, dates, comments)

Cons:
- the rate limit is tied to account karma (1000 on a fresh account)

In [9]:
# constants
limit = 1000

# scrape latest submissions
# The file name records the source, the time of the run and the limit used.
filename = [
    "praw",
    "timestamp", dt.now().strftime(format="%Y%m%d_%H%M"),
    f"limit-{limit}",
]
outpath = path.join(submissions_dir, f"{'-'.join(filename)}.jsonl")
print(f"exporting to {outpath}")

# Write one JSON object per line. The `with` block closes the file even if
# the scrape raises part-way through (network error, rate limit, interrupt),
# so the lines written so far are flushed and the handle is not leaked.
with open(outpath, "w") as f_out:
    for post in tqdm(antiwork.new(limit=limit)):
        data = make_json_formattable(post.__dict__)
        print(json.dumps(data), file=f_out)

# check remaining limit
print(reddit.auth.limits)

exporting to output/submissions/praw-timestamp-20220104_1232-limit-1000.jsonl
{'remaining': 290.0, 'reset_timestamp': 1641300001.6669414, 'used': 10}


In [8]:
# read the exported PRAW submissions (one JSON object per line) into a dataframe
filepath = path.join(
    submissions_dir,
    "praw-timestamp-20220104_1232-limit-1000.jsonl",
)
with open(filepath) as f:
    df = pd.read_json(f, lines=True)

# peek: alphabetised column names first, then the leading rows
print(sorted(df.columns))
display(df.head())

# the frame was loaded only for inspection, so release it
del df

['_comments_by_id', '_fetched', 'all_awardings', 'allow_live_comments', 'approved_at_utc', 'approved_by', 'archived', 'author', 'author_cakeday', 'author_flair_background_color', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_is_blocked', 'author_patreon_flair', 'author_premium', 'awarders', 'banned_at_utc', 'banned_by', 'can_gild', 'can_mod_post', 'category', 'clicked', 'comment_limit', 'comment_sort', 'content_categories', 'contest_mode', 'created', 'created_utc', 'crosspost_parent', 'crosspost_parent_list', 'discussion_type', 'distinguished', 'domain', 'downs', 'edited', 'gallery_data', 'gilded', 'gildings', 'hidden', 'hide_score', 'id', 'is_created_from_ads_ui', 'is_crosspostable', 'is_gallery', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'likes', 'link_flair_background_color', 'link_flair_css_cl

Unnamed: 0,comment_limit,comment_sort,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,...,post_hint,url_overridden_by_dest,preview,crosspost_parent_list,crosspost_parent,media_metadata,is_gallery,gallery_data,link_flair_template_id,author_cakeday
0,2048,confidence,,antiwork,"Hey everyone, I don't know if this fits here b...",t2_g2jxack5,False,,0,False,...,,,,,,,,,,
1,2048,confidence,,antiwork,,t2_ewfok,False,,0,False,...,image,https://i.redd.it/5yhnjvyf0o981.jpg,{'images': [{'source': {'url': 'https://previe...,,,,,,,
2,2048,confidence,,antiwork,"I'm 34 years old, I left school when I was 15 ...",t2_8h96fpkn,False,,0,False,...,,,,,,,,,,
3,2048,confidence,,antiwork,This group has made me aware of how important ...,t2_hghprp9p,False,,0,False,...,,,,,,,,,,
4,2048,confidence,,antiwork,...its 3 degrees outside and you are on break ...,t2_aamh9jeu,False,,0,False,...,,,,,,,,,,


In [21]:
# each submission has a CommentForest object, which stores comments recursively
# option 1: depth-first nested list of full comment information
# option 2: option 1 but stipulate maximum depth
# option 3: depth-first nested list of comment ids, plus a flat list of all comments from the subreddit

def format_comment(comment, id_only=True):
    """Turn one comment object into a plain dict.

    With id_only truthy (the default) the result is just {"id": comment.id};
    otherwise the comment's whole attribute dict is passed through
    make_json_formattable.
    """
    if not id_only:
        return make_json_formattable(comment.__dict__)
    return {"id": comment.id}

def get_replies(comment, n=None, depth=None, id_only=True):
    """Collect the replies below `comment` as a nested list.

    Each entry has the form
        {"comment": <format_comment(reply)>, "replies": <entries for that reply>}
    and is produced in the iteration order of `comment.replies`.

    Parameters
    - comment: object exposing an iterable `replies` attribute.
    - n: maximum number of replies kept at each level; None keeps all of them.
    - depth: number of reply levels to descend. None and 0 both mean
      "no replies" and return an empty list.
    - id_only: forwarded to format_comment.
    """
    if depth is None or depth == 0:
        return []

    replies = []
    for i, reply in enumerate(comment.replies):
        if n is not None and i == n:
            break

        replies.append({
            "comment": format_comment(reply, id_only=id_only),
            # forward n too, so the per-level cap holds at every depth rather
            # than only for the direct replies of the starting comment
            "replies": get_replies(reply, n=n, depth=depth-1, id_only=id_only)
        })

    return replies

def get_comments(submission, n=None, depth=None, id_only=True):
    """Build the nested comment list for one submission.

    Keeps at most n top-level comments (all of them when n is None). Each
    kept comment becomes {"comment": ..., "replies": ...}, where the comment
    part comes from format_comment and the replies part from get_replies
    called with the same n, depth and id_only.
    """
    # Runs before anything is read from the forest.
    # NOTE(review): per PRAW's CommentForest API this presumably expands every
    # "more comments" placeholder, with limit=None removing the cap — confirm.
    submission.comments.replace_more(limit=None)

    collected = []
    for index, top_level in enumerate(submission.comments):
        if n is not None and index == n:
            break
        entry = {
            "comment": format_comment(top_level, id_only=id_only),
            "replies": get_replies(top_level, n=n, depth=depth, id_only=id_only),
        }
        collected.append(entry)
    return collected

In [30]:
# constants
limit = 2   # number of submissions to pull
n = 3       # comments kept per level (passed to get_comments)
depth = 2   # reply levels kept below each top-level comment

# scraping comments with option 2 for now
# The file name records the source, the time of the run and the parameters.
filename = [
    "praw",
    "timestamp", dt.now().strftime(format="%Y%m%d_%H%M"),
    f"limit-{limit}",
    f"n-{n}",
    f"depth-{depth}",
]
outpath = path.join(comments_dir, f"{'-'.join(filename)}.jsonl")
print(f"exporting to {outpath}")

# Write one line per submission: {"id": ..., "comments": <nested list>}.
# This loop is slow, so the `with` block matters: it closes the file even if
# the scrape raises or is interrupted, keeping the lines already written.
with open(outpath, "w") as f_out:
    for submission in tqdm(antiwork.controversial(limit=limit)):
        comments = get_comments(submission, n=n, depth=depth, id_only=False)
        out = {"id": submission.id, "comments": comments}
        print(json.dumps(out), file=f_out)

# check rate limit
print(reddit.auth.limits)

exporting to output/comments/praw-timestamp-20220104_1303-limit-2-n-3-depth-2.jsonl


2it [04:02, 121.30s/it]

{'remaining': 120.0, 'reset_timestamp': 1641301800.4486718, 'used': 180}





In [34]:
# pretty-print every stored comment tree (the file holds one JSON document per line)
filepath = path.join(
    comments_dir,
    "praw-timestamp-20220104_1303-limit-2-n-3-depth-2.jsonl",
)

with open(filepath) as f:
    # decode lazily, line by line, in file order
    for comments in map(json.loads, f):
        pprint(comments)

{'comments': [{'comment': {'_fetched': True,
                           '_submission': 'ro1qvu',
                           'all_awardings': [],
                           'approved_at_utc': None,
                           'approved_by': None,
                           'archived': False,
                           'associated_award': None,
                           'author': 'davidducker',
                           'author_flair_background_color': None,
                           'author_flair_css_class': None,
                           'author_flair_richtext': [],
                           'author_flair_template_id': None,
                           'author_flair_text': None,
                           'author_flair_text_color': None,
                           'author_flair_type': 'text',
                           'author_fullname': 't2_139vooaj',
                           'author_is_blocked': False,
                           'author_patreon_flair': False,
                  

## Pushshift (PSAW)

Pros:

- no rate limits
- easy to scrape all posts within a specified timeframe

Cons:

- reconstructing the depth-first comment forest may be difficult

In [35]:
# pushshift setup
# Wrap the authenticated PRAW client (`reddit`) in a PSAW client.
# NOTE(review): presumably passing `reddit` makes PSAW look results up on
# Pushshift and then fetch the full objects through Reddit's API via PRAW,
# which would also consume Reddit rate limit — confirm against the PSAW README.
api = PushshiftAPI(reddit)

In [41]:
# unix timeframe
after = dt(2022, 1, 1)
before = dt.now()

# constants
# Assigned before the file name is built: the name embeds the limit, and
# reading `limit` first would pick up whatever an earlier cell left behind.
limit = 1000

# prepare outpath
filename = [
    "psaw",
    "after", after.strftime(format="%Y%m%d"),
    "before", before.strftime(format="%Y%m%d"),
    f"limit-{limit}"
]
outpath = path.join(submissions_dir, f"{'-'.join(filename)}.jsonl")
print(f"exporting to {outpath}")

# scrape (limit) submissions within this timeframe
submissions = api.search_submissions(
    after=int(after.timestamp()),
    before=int(before.timestamp()),
    subreddit="antiwork",
    limit=limit,
)

# Iterate the Pushshift search results. The previous version looped over
# antiwork.new(...), so the "psaw" file actually held PRAW's newest posts and
# the time window was ignored.
# The `with` block closes the file even if the scrape raises part-way through.
with open(outpath, "w") as f_out:
    for post in tqdm(submissions):
        data = make_json_formattable(post.__dict__)
        print(json.dumps(data), file=f_out)

# check remaining limit
# NOTE(review): this was labelled "rate limit is not affected", but `api`
# wraps the PRAW client, so the search may still spend Reddit requests —
# compare the printed values before and after to confirm.
print(reddit.auth.limits)

exporting to output/submissions/psaw-after-20220101-before-20220104-limit-1000.jsonl


In [42]:
# read the exported PSAW submissions (one JSON object per line) into a dataframe
filepath = path.join(
    submissions_dir,
    "psaw-after-20220101-before-20220104-limit-1000.jsonl",
)
with open(filepath) as f:
    df = pd.read_json(f, lines=True)

# peek: alphabetised column names first, then the leading rows
print(sorted(df.columns))
display(df.head())

# the frame was loaded only for inspection, so release it
del df

['_comments_by_id', '_fetched', 'all_awardings', 'allow_live_comments', 'approved_at_utc', 'approved_by', 'archived', 'author', 'author_cakeday', 'author_flair_background_color', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_is_blocked', 'author_patreon_flair', 'author_premium', 'awarders', 'banned_at_utc', 'banned_by', 'can_gild', 'can_mod_post', 'category', 'clicked', 'comment_limit', 'comment_sort', 'content_categories', 'contest_mode', 'created', 'created_utc', 'crosspost_parent', 'crosspost_parent_list', 'discussion_type', 'distinguished', 'domain', 'downs', 'edited', 'gallery_data', 'gilded', 'gildings', 'hidden', 'hide_score', 'id', 'is_created_from_ads_ui', 'is_crosspostable', 'is_gallery', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'likes', 'link_flair_background_color', 'link_flair_css_cl

Unnamed: 0,comment_limit,comment_sort,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,...,post_hint,crosspost_parent_list,url_overridden_by_dest,preview,crosspost_parent,media_metadata,is_gallery,gallery_data,link_flair_template_id,author_cakeday
0,2048,confidence,,antiwork,Hey guys so this isn't the most heinous thing ...,t2_9lczq,False,,0,False,...,,,,,,,,,,
1,2048,confidence,,antiwork,My employer says I need to provide them with a...,t2_f9mwbg8k,False,,0,False,...,,,,,,,,,,
2,2048,confidence,,antiwork,I applied for a job with qualifications I have...,t2_8w1g2qpq,False,,0,False,...,,,,,,,,,,
3,2048,confidence,,antiwork,"Whenever I feel like taking a shit, I just go ...",t2_d8sm9,False,,0,False,...,,,,,,,,,,
4,2048,confidence,,antiwork,"I work at a grocery store, night crew lead, ar...",t2_vnw4hyb,False,,0,False,...,,,,,,,,,,


In [51]:
# unix timeframe
after = dt(2022, 1, 1)
before = dt.now()

# constants
# Assigned before the file name is built: the name embeds the limit, and
# reading `limit` first would pick up whatever an earlier cell left behind.
limit = 1000

# prepare outpath
filename = [
    "psaw",
    "after", after.strftime(format="%Y%m%d"),
    "before", before.strftime(format="%Y%m%d"),
    f"limit-{limit}"
]
outpath = path.join(comments_dir, f"{'-'.join(filename)}.jsonl")
print(f"exporting to {outpath}")

# scrape (limit) comments within this timeframe
# The window is passed to the search so the data matches the after/before
# values recorded in the file name; previously the query had no time bounds.
comments = api.search_comments(
    after=int(after.timestamp()),
    before=int(before.timestamp()),
    subreddit="antiwork",
    limit=limit,
)

# One JSON object per line. The `with` block closes the file even if the
# scrape raises part-way through.
with open(outpath, "w") as f_out:
    for comment in tqdm(comments):
        out = make_json_formattable(comment.__dict__)
        print(json.dumps(out), file=f_out)

exporting to output/comments/psaw-after-20220101-before-20220104-limit-1000.jsonl


1000it [01:39, 10.03it/s]


In [53]:
# pretty-print the first ten stored comments (the file holds one JSON object per line)
filepath = path.join(
    comments_dir,
    "psaw-after-20220101-before-20220104-limit-1000.jsonl",
)
with open(filepath) as f:
    # pairing with range(10) stops after ten lines, or earlier on a shorter file
    for _, line in zip(range(10), f):
        record = json.loads(line)
        pprint(record)

{'_fetched': True,
 '_replies': [],
 '_submission': None,
 'all_awardings': [],
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'associated_award': None,
 'author': 'Mharbles',
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_5xj6l',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'body': "I'm on /r/vanlife and the money some people seem to be willing to "
         'sink into a van is insane, like 5 years of rent insane. Granted '
         'these are the same people who probably have enough money to hit up a '
         'hotel when things get uncomfortable (in other words, all the time).\n'
         '\n'
         'Forced vanlife is running around in a junker in a ra