# Scraping all Reddit Posts from r/ADHD_Programmers and r/adderall subreddits

In [2]:
import praw
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
import json


In [3]:
# key values needed for reddit object

secret = "redacted"
name = "redacted"
clientID = "redacted"
developerName = "redacted"

In [4]:
# Read-only instance
reddit_read_only = praw.Reddit(client_id=clientID,         # your client id
                               client_secret=secret,      # your client secret
                               user_agent=name)        # your user agent

In [5]:

# This should be either ADHD_Programers or adderall
subreddit = reddit_read_only.subreddit("ADHD_Programmers")
 
# Display the name of the Subreddit
print("Display Name:", subreddit.display_name)
 
# Display the title of the Subreddit
print("Title:", subreddit.title)
 
# Display the description of the Subreddit
print("Description:", subreddit.description)

Display Name: ADHD_Programmers
Title: Got ADHD? Program computers? Even close with either? Talk about it here. 
Description: It seems many people who program computers, or do web design, or networks, or something technically computer related also seem to have some form of r/ADHD.  We thought we'd start this subreddit as a forum to discuss techniques we use to cope, experiences we've had, etc.  Open to anyone who wants to hang out and be constructive, learn, teach, or offer a helping hand. 

This is a subclass of r/ADHD.   ;-)   All rules from the r/ADHD sub apply here. However, posts here in /r/ADHD_Programmers should lean towards or related to IT\programming.


In [11]:
submissions = set()
search_log = {}

In reddit, you can search for posts based off relevance, hot, top, new, or comments. We will do all of these, and then get the joined set

In [12]:
# We are going to get the union of posts with all of these search terms
search_types = ["relevance", "hot", "top", "new", "comments"]
time_filter = ["all", "day", "hour", "month", "week", "year"]
if subreddit.display_name == "ADHD_Programmers":
    search_terms = ["adderall", "software", "stigma", "medication", "enhancement", "stimulants", "symptom", "communication", "programming", "accommodations", "meds", "depression", "vyvanse", "antidepressants", "methylphenidate", "meth", "beer", "focus", "drug", "work", "job", "coding", "performance", "productive", "stand", "ADHD", "interview", "debugging" "bug", "software", "documentation", "test", "fired", "remote", "neurodivergence", "mental health", "learn", "website", "problem", "major", "degree", "start-up", "stress", "experience", "code", "distracting", "computer", "procrastinate", "employee", "hyperfocus", "productive", "productivity", "creativity", "email", "office", "meeting", "forget", "bootcamp", "fail"]
elif subreddit.display_name == "adderall" or subreddit.display_name == "Stims":
    search_terms = ["software", "programming", "coding", "python", "java", "code"]

In [13]:

num_run = 0
for search in search_types:
    for time in time_filter:
        for term in search_terms:
            this_search_submissions = [result for result in subreddit.search(sort=search, time_filter=time, query=term)]
            search_log[search+'_'+time+'_'+term] = this_search_submissions
            submissions = submissions.union(list(this_search_submissions))
            num_run += 1
            print(num_run, len(submissions))


1 100
2 198
3 209
4 284
5 292
6 355
7 422
8 490
9 555
10 601
11 655
12 698
13 724
14 725
15 729
16 731
17 737
18 788
19 802
20 842
21 866
22 888
23 929
24 968
25 998
26 1030
27 1055
28 1055
29 1055
30 1101
31 1131
32 1171
33 1200
34 1205
35 1226
36 1253
37 1299
38 1309
39 1346
40 1375
41 1388
42 1407
43 1416
44 1416
45 1441
46 1451
47 1486
48 1496
49 1534
50 1534
51 1543
52 1558
53 1587
54 1604
55 1619
56 1643
57 1668
58 1684
59 1684
60 1684
61 1684
62 1684
63 1684
64 1684
65 1684
66 1684
67 1684
68 1684
69 1684
70 1684
71 1684
72 1684
73 1684
74 1684
75 1684
76 1684
77 1684
78 1684
79 1684
80 1684
81 1684
82 1684
83 1684
84 1684
85 1684
86 1684
87 1684
88 1684
89 1684
90 1684
91 1684
92 1684
93 1684
94 1684
95 1684
96 1684
97 1684
98 1684
99 1684
100 1684
101 1684
102 1684
103 1684
104 1684
105 1684
106 1684
107 1684
108 1684
109 1684
110 1684
111 1684
112 1684
113 1684
114 1684
115 1684
116 1684
117 1684
118 1684
119 1684
120 1684
121 1684
122 1684
123 1684
124 1684
125 1684
126 1684

In [14]:
# Now that we have all of the relevant submissions, the goal is to get all of the comments from those submissions
full_submissions = []

def get_replies(commentForest):
    if len(commentForest) == 0:
        return []
    these_comments = []
    for comment in commentForest:
        this_comment_replies = get_replies(comment.replies)
        these_comments.append((comment.body, comment.id, comment.score, this_comment_replies))
    return these_comments

with open('r_' + subreddit.display_name + '_Posts_all.json', 'a', encoding='utf-8') as f:
    num_submissions = 0
    for submission in submissions:
        num_submissions += 1
        this_submission = {}
        this_submission['id'] = submission.id
        this_submission['title'] = submission.title
        this_submission['url'] = submission.url
        this_submission['created_utc'] = submission.created_utc
        this_submission['num_comments'] = submission.num_comments
        this_submission['num_up_votes'] = submission.score
        this_submission['is_self'] = submission.is_self
        this_submission['self_text'] = submission.selftext
        this_submission['upvote_ratio'] = submission.upvote_ratio
        this_submission['subreddit'] = submission.subreddit.display_name
        this_submission['permalink'] = submission.permalink
        print("Doing submission", num_submissions, "of", len(submissions), ": this submission has", submission.num_comments, "comments")
        submission.comments.replace_more(limit=None)
        this_submission['comments'] = get_replies(submission.comments.list())
        f.write(json.dumps(this_submission, ensure_ascii=False, indent=4))
        f.write('\n')
        f.flush()
    
    full_submissions.append(this_submission)


Doing submission 1 of 2037 : this submission has 17 comments
Doing submission 2 of 2037 : this submission has 37 comments
Doing submission 3 of 2037 : this submission has 18 comments
Doing submission 4 of 2037 : this submission has 21 comments
Doing submission 5 of 2037 : this submission has 23 comments
Doing submission 6 of 2037 : this submission has 15 comments
Doing submission 7 of 2037 : this submission has 13 comments
Doing submission 8 of 2037 : this submission has 4 comments
Doing submission 9 of 2037 : this submission has 4 comments
Doing submission 10 of 2037 : this submission has 4 comments
Doing submission 11 of 2037 : this submission has 3 comments
Doing submission 12 of 2037 : this submission has 11 comments
Doing submission 13 of 2037 : this submission has 7 comments
Doing submission 14 of 2037 : this submission has 36 comments
Doing submission 15 of 2037 : this submission has 6 comments
Doing submission 16 of 2037 : this submission has 3 comments
Doing submission 17 of 2

# Now that we have scrapped all the data, it's time to generate the csv to pick the relevant posts for our study

In [15]:
#https://stackoverflow.com/questions/27907633/how-to-extract-multiple-json-objects-from-one-file
from json import JSONDecoder, JSONDecodeError
import re

NOT_WHITESPACE = re.compile(r'[^\s]')

def decode_stacked(file_name, pos=0, decoder=JSONDecoder()):
    with open(file_name, 'r') as f:
        document = f.read()
        while True:
            match = NOT_WHITESPACE.search(document, pos)
            if not match:
                return
            pos = match.start()
            
            try:
                obj, pos = decoder.raw_decode(document, pos)
            except JSONDecodeError:
                # do something sensible if there's some error
                raise
            yield obj

In [16]:
def make_one_line(string):
    return string.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

out_csv = [["subreddit", "postID", "postTitle", "postBody", "numberOfComments", "mightBeRelevant"]]
#for json_obj in decode_stacked("r_adderall_Posts.json"):
#    out_csv.append([json_obj['subreddit'], json_obj['id'], make_one_line(json_obj['title']), make_one_line(json_obj['self_text']), json_obj['num_comments'], ''])
for json_obj in decode_stacked("r_ADHD_Programmers_Posts_all.json"):
    out_csv.append([json_obj['subreddit'], json_obj['id'], make_one_line(json_obj['title']), make_one_line(json_obj['self_text']), json_obj['num_comments'], ''])
#for json_obj in decode_stacked("r_Stims_Posts.json"):
#    out_csv.append([json_obj['subreddit'], json_obj['id'], make_one_line(json_obj['title']), make_one_line(json_obj['self_text']), json_obj['num_comments'], ''])

import csv
with open('PostIsRelevantFinal.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    csvwriter.writerows(out_csv)

In [22]:
print(len(submissions))

2037
