# Classification of the /r/ADHD and /r/Anxiety SubReddits

### Importing the subreddits through the PushShift API

In [16]:
import requests, time, csv, json, re
import pandas as pd

In [17]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [18]:
# Query parameters for the exploratory request: both subreddits in a single call.
# NOTE(review): 'size' may be capped server-side -- the later reddit_query run
# needed two requests to collect 2000 submissions; confirm against the API docs.
params = {'subreddit':'adhd,anxiety',
          'size':2000,
         }

In [19]:
# Exploratory request to inspect the shape of the API payload.
# A timeout is set so that an unresponsive server cannot hang the kernel.
response = requests.get(url, params=params, timeout=30)

In [20]:
response.status_code

200

In [22]:
response.json()['data'][0]

{'author': 'PupperinoSuprimo',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_2qy0fny9',
 'author_patreon_flair': False,
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1554338785,
 'domain': 'self.Anxiety',
 'full_link': 'https://www.reddit.com/r/Anxiety/comments/b964c6/if_youll_never_really_know_let_that_damn_thought/',
 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0},
 'id': 'b964c6',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status': 'house_only',
 'permalink': '/r/Anxiety/c

Creating a list of the data I want to save

In [23]:
# Submission fields to keep from each raw API record.
# NOTE(review): reddit_parse defines its own local list with the same entries,
# so this module-level copy is not referenced by the visible cells.
col_list = ['author',
            'subreddit',
            'title',
            'selftext',
            'created_utc',
            'retrieved_on',
            'url',
            'pinned',
            'media_only'
            ]

Defining a function here that will automatically save the data with a unique filename, in case I download again in the future and don't want to overwrite my existing data.

In [24]:
def filename_format_log(file_path, 
                        logfile = './assets/file_log.txt', 
                        now = None, 
                        file_description = None): 
    """Prefix a file name with a timestamp and record the result in a log file.

    Parameters
    ----------
    file_path : str
        Relative path that includes a file extension,
        e.g. './data/raw_adhd.json'.
    logfile : str
        Text file that receives one '<formatted_name>: <description>' line
        per call (opened in append mode).
    now : int, optional
        Timestamp inserted in front of the file name. When omitted, the
        current time is taken at call time (a default of
        ``round(time.time())`` in the signature would be frozen at the moment
        the function is defined).
    file_description : str, optional
        Text written to the log next to the name. Defaults to
        'Pull: <UTC time of `now`>'.

    Returns
    -------
    tuple
        (formatted_name, now, file_description)

    Raises
    ------
    NameError
        If `file_path` does not contain an extension separator.
    """
    if now is None:
        now = round(time.time())

    # A '.' that is not the first character and is not part of a '..' run.
    if re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path) is None:
        raise NameError('Please enter a relative path with a file extension.') 

    # Preferred insertion point: the start of a 'word_word' stem directly in
    # front of a '.', as in 'raw_adhd.json'.
    stem = re.search(r'(?<!^)(?<!\.)[a-z]+_[a-z]+(?=\.)', file_path)
    if stem is not None:
        stamp = stem.start()
    else:
        # Other file names: insert right after the last directory separator.
        stamp = file_path.rfind('/') + 1

    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'  
    if not file_description:
        file_description = f'Pull: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

Defining a custom function to run my subreddit queries, show the status and progress as it happens, and automatically write the data into a corresponding .json file using my previously defined custom function from above.

In [55]:
def reddit_query(subreddits, n_samples=1500, before=None, after=None):
    """Page through the submission search endpoint and save the results as JSON.

    Requests are repeated, each one asking for submissions older than the
    last one received, until at least `n_samples` submissions have been
    collected or the API returns an empty page. The collected records are
    written to './data/<timestamp>_raw_<subreddits>.json' (name produced by
    `filename_format_log`), where <timestamp> is the 'created_utc' of the
    last submission received.

    Parameters
    ----------
    subreddits : str
        Value sent as the 'subreddit' query parameter; also used in the
        output file name.
    n_samples : int
        Minimum number of submissions to collect; also sent as 'size'.
    before : optional
        Upper bound sent as 'before' on the first request. Defaults to the
        current time minus one second.
    after : optional
        Value sent unchanged as the 'after' query parameter.

    Returns
    -------
    None on success (progress is printed), or an error string when a request
    fails or nothing could be collected.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    # `cursor` is the value sent as 'before'; it moves back in time after
    # every page that contains data.
    cursor = round(time.time()) - 1 if before is None else before
    comment_list = []
    
    run = 1
    while len(comment_list) < n_samples:
        print(f'Starting query {run}')
        
        params = {
          'subreddit':subreddits,
          'sort':'desc',
          'size':n_samples,
          'before':cursor,
          'after':after,
         }
        
        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException:
            return 'Error. Pull not completed.'
        
        if response.status_code != 200:
            return f'Check status. Error code: {response.status_code}'
        
        try:
            posts = response.json()['data']
            oldest = posts[-1]['created_utc'] if posts else None
        except (ValueError, KeyError, TypeError):
            return 'Error. Pull not completed.'
        
        if not posts:
            # An empty page would never advance the cursor; stop here rather
            # than re-sending the same request forever.
            print(f'No more submissions returned after {len(comment_list)}; stopping early.')
            break
        
        comment_list.extend(posts)
        cursor = oldest - 1
        time.sleep(1) 
        run += 1
    
    if not comment_list:
        return 'Error. Pull not completed.'
    
    timestamp = comment_list[-1]['created_utc']
    formatted_name, _, _ = filename_format_log(file_path =f'./data/raw_{subreddits}.json', now=timestamp)
    with open(formatted_name, 'w+') as f:
        json.dump(comment_list, f)
    
    print(f'Saved and completed query and returned {len(comment_list)} submissions.')
    print(f'Reddit text is ready for processing.')
    print(f'Last timestamp was {timestamp}.')

Running the query for the ADHD subreddit

In [56]:
reddit_query(subreddits='adhd', n_samples=2000)

Starting query 1
Starting query 2
Saved and completed query and returned 2000 submissions.
Reddit text is ready for processing.
Last timestamp was 1553533645.


Loading the saved .json file

In [57]:
# Reload the ADHD pull saved by the reddit_query run above.
with open('./data/1553533645_raw_adhd.json', 'r') as raw_file:
    adhd_list = json.load(raw_file)

In [58]:
len(adhd_list)

2000

Checking to see if the data came through

In [59]:
adhd_list[0]

{'author': 'muchomuchomaas',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_8aktb',
 'author_patreon_flair': False,
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1554340205,
 'domain': 'self.ADHD',
 'full_link': 'https://www.reddit.com/r/ADHD/comments/b96cun/just_starting_meds/',
 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0},
 'id': 'b96cun',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status': 'all_ads',
 'permalink': '/r/ADHD/comments/b96cun/just_starting_meds/',
 'pinned':

Defining a custom function to automatically parse my data so that it only keeps what I want and makes it ready to be turned into a Pandas DataFrame.

In [60]:
def reddit_parse(sample):
    """Turn raw submission records into a DataFrame with a binary 'adhd' label.

    Parameters
    ----------
    sample : list of dict
        Raw submission records; each must contain every key in `col_list`.

    Returns
    -------
    pandas.DataFrame
        The columns of `col_list`, with 'subreddit' replaced by 'adhd'
        (1 for 'ADHD', 0 for 'Anxiety', NaN for any other value).

    Raises
    ------
    KeyError
        If any of the expected columns is missing from `sample`.
    """
    col_list = ['author',
                'subreddit',
                'title',
                'selftext',
                'created_utc',
                'retrieved_on',
                'url',
                'pinned',
                'media_only'
                ]
    
    posts_df = pd.DataFrame(sample)[col_list]
    labels = posts_df['subreddit'].map({'ADHD':1, 'Anxiety':0})
    
    # rename/assign each return a new frame, so nothing is written into a
    # column subset of the raw frame (the pattern that triggers pandas'
    # SettingWithCopyWarning).
    parsed = posts_df.rename(columns={'subreddit':'adhd'}).assign(adhd=labels)
    
    # Same order as col_list, with the label column in place of 'subreddit'.
    col_order = ['adhd' if col == 'subreddit' else col for col in col_list]

    return parsed[col_order]

In [61]:
adhd_df = reddit_parse(adhd_list)

In [62]:
adhd_df.adhd.value_counts()

1    2000
Name: adhd, dtype: int64

In [63]:
adhd_df.head()

Unnamed: 0,author,adhd,title,selftext,created_utc,retrieved_on,url,pinned,media_only
0,muchomuchomaas,1,Just starting meds,So I saw my psychiatrist yesterday for the fir...,1554340205,1554340206,https://www.reddit.com/r/ADHD/comments/b96cun/...,False,False
1,RedMooseBlueMoose,1,I CANT FUCKING SLEEP,,1554339679,1554339679,https://www.reddit.com/r/ADHD/comments/b969qf/...,False,False
2,lampdude,1,More than 7 hours of sleep will really fuck up...,If I get 4-6 hours of sleep I am far more prod...,1554339423,1554339424,https://www.reddit.com/r/ADHD/comments/b96846/...,False,False
3,miljou,1,I (21F) just started Adderall -- Some question...,"Heya, I just got started on Adderall IR 10mg u...",1554339377,1554339378,https://www.reddit.com/r/ADHD/comments/b967vk/...,False,False
4,Chrischticks,1,Untreated ADHD at 20 years old,Spent most of my life as the class clown and t...,1554339254,1554339255,https://www.reddit.com/r/ADHD/comments/b96755/...,False,False


---

Repeating the above for the Anxiety subreddit.

In [64]:
reddit_query(subreddits='anxiety', n_samples=2000)

Starting query 1
Starting query 2
Saved and completed query and returned 2000 submissions.
Reddit text is ready for processing.
Last timestamp was 1553554769.


In [65]:
# Reload the Anxiety pull saved by the reddit_query run above.
with open('./data/1553554769_raw_anxiety.json', 'r') as raw_file:
    anxiety_list = json.load(raw_file)

In [66]:
len(anxiety_list)

2000

In [67]:
anxiety_list[0]

{'author': 'ZorroNegro',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_b769nm6',
 'author_patreon_flair': False,
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1554340190,
 'domain': 'self.Anxiety',
 'full_link': 'https://www.reddit.com/r/Anxiety/comments/b96cqu/please_i_need_help/',
 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0},
 'id': 'b96cqu',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_css_class': 'helpflair',
 'link_flair_richtext': [],
 'link_flair_text': 'Needs A Hug/Support',
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status

In [70]:
def reddit_parse(sample):
    """Turn raw submission records into a DataFrame with a binary 'anxiety' label.

    Note: this reuses the name `reddit_parse`, so running this cell replaces
    the ADHD-labelled version defined earlier in the notebook.

    Parameters
    ----------
    sample : list of dict
        Raw submission records; each must contain every key in `col_list`.

    Returns
    -------
    pandas.DataFrame
        The columns of `col_list`, with 'subreddit' replaced by 'anxiety'
        (1 for 'Anxiety', 0 for 'ADHD', NaN for any other value).

    Raises
    ------
    KeyError
        If any of the expected columns is missing from `sample`.
    """
    col_list = ['author',
                'subreddit',
                'title',
                'selftext',
                'created_utc',
                'retrieved_on',
                'url',
                'pinned',
                'media_only'
                ]
    
    posts_df = pd.DataFrame(sample)[col_list]
    labels = posts_df['subreddit'].map({'Anxiety':1, 'ADHD':0})
    
    # rename/assign each return a new frame, so nothing is written into a
    # column subset of the raw frame (the pattern that triggers pandas'
    # SettingWithCopyWarning).
    parsed = posts_df.rename(columns={'subreddit':'anxiety'}).assign(anxiety=labels)
    
    # Same order as col_list, with the label column in place of 'subreddit'.
    col_order = ['anxiety' if col == 'subreddit' else col for col in col_list]

    return parsed[col_order]

In [71]:
anxiety_df = reddit_parse(anxiety_list)

In [72]:
anxiety_df.anxiety.value_counts()

1    2000
Name: anxiety, dtype: int64

In [73]:
anxiety_df.head()

Unnamed: 0,author,anxiety,title,selftext,created_utc,retrieved_on,url,pinned,media_only
0,ZorroNegro,1,"Please, I need help",My friend asked me to be his best man for his ...,1554340190,1554340191,https://www.reddit.com/r/Anxiety/comments/b96c...,False,False
1,PupperinoSuprimo,1,"If you'll never really know, let that damn tho...","Lately, this has become a sort of mantra I've ...",1554338785,1554338785,https://www.reddit.com/r/Anxiety/comments/b964...,False,False
2,moake740,1,Feeling like such a failure,"I suck, I haven’t been taking my meds a prescr...",1554338572,1554338573,https://www.reddit.com/r/Anxiety/comments/b963...,False,False
3,ketchuptiles,1,Rolling anxiety attacks?,"Hey everyone,\n\nI could really use your colle...",1554338312,1554338313,https://www.reddit.com/r/Anxiety/comments/b961...,False,False
4,CrazyTayTay,1,I have a serious problem with chewing my cutic...,,1554338074,1554338076,https://i.redd.it/904ci09l75q21.jpg,False,False


---

Merging both dataframes into one for easier cleaning

In [74]:
# Stack the two labelled frames row-wise. An outer merge would join on every
# shared column and reorder the rows; concat simply appends them. Each frame
# carries only its own label column ('anxiety' / 'adhd'), so the other label
# is NaN here and is filled with 0 in a later cell.
df = pd.concat([anxiety_df, adhd_df], ignore_index=True, sort=False)

In [94]:
df.isna().sum()

author          0
anxiety         0
title           0
selftext        0
created_utc     0
retrieved_on    0
url             0
pinned          0
media_only      0
adhd            0
dtype: int64

In [78]:
# Rows from one subreddit have no value for the other subreddit's label
# column after the frames are combined; treat those as 0.
df = df.fillna(0)

Checking for and dropping duplicates

In [90]:
duplicates = df.duplicated(subset=['title', 'author'])

In [91]:
duplicates.value_counts()

False    3988
True       12
dtype: int64

It looks like we only have 12 duplicates, so I'm going to drop those.

In [92]:
# Keep the first occurrence of each (title, author) pair.
df = df.drop_duplicates(subset=['title', 'author'])

Saving the data out so I can do more rigorous cleaning in a separate notebook

In [93]:
df.to_csv('./data/clean.csv')