In [1]:
# Import Dependencies
import praw
import pandas as pd
import re
from tqdm.notebook import tqdm
from time import sleep

In [50]:
# Loading Reddit API
reddit = praw.Reddit(client_id='-----',
                     client_secret='-----',
                     user_agent='-----',)
reddit.read_only = True

In [37]:
# Loading functions needed for extracting and cleaning data
# Defining function to extract reddit data
def reddit_extractor(display_name, num_top_posts):
    submission_info = []
    subreddit = reddit.subreddit(str(display_name))
    for submission_post in tqdm(subreddit.top(limit=num_top_posts)):
        submission = reddit.submission(id=submission_post.id)
        submission.comment_sort = "top"
        for comment in submission.comments[:2]:
            submission_info.append([submission_post.id, submission_post.selftext, 
                                    comment.body, comment.score])
        sleep(0.01)
    submission_columns = ['Post_ID', 'Post', 'Comment', 'Score']
    submission_comment_df = pd.DataFrame(submission_info, columns=submission_columns)
    return submission_comment_df

# Defining function to remove emojis/icons/symbols from text
def deEmojify(text):
    """Function removes emojis and other icons/symbols from text, see
    https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python"""
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def reddit_data_cleaner(data):
    """Function cleans up reddit data using deEmojify, replacing line breaks, etc."""
    # Removing emojis/icons/symbols from text
    data['Post'] = data['Post'].apply(deEmojify)
    data['Comment'] = data['Comment'].apply(deEmojify)
    # Removing removed and deleted comments
    deleted_comments = data['Comment'] != "[deleted]"
    removed_comments = data['Comment'] != "[removed]"
    data = data[deleted_comments & removed_comments]
    # Removing cleaned empty posts and comments
    empty_posts = data['Post'] != ""
    empty_comments = data['Comment'] != ""
    data = data[empty_posts & empty_comments]
    # Removing posts with only "."
    dot_posts = data['Post'] != "."
    dot_comments = data['Comment'] != "."
    # Replacing line breaks with spaces in posts and comments
    data['Post'] = data['Post'].str.replace("\n\n", " ")
    data['Post'] = data['Post'].str.replace("\n", " ")
    data['Comment'] = data['Comment'].str.replace("\n\n", " ")
    data['Comment'] = data['Comment'].str.replace("\n", " ")
    # Group comments by specific thread posts
    grouped_comment_trees = data.groupby('Post_ID')
    # Pull top comments in each group
    top_comments_in_group = grouped_comment_trees.max()
    top_comments_in_group = top_comments_in_group.reset_index()
    return top_comments_in_group

In [38]:
# Selecting subreddit to extract from
subreddit_name = 'therapy'
num_posts = 500

In [39]:
# Getting data as pandas dataframe
reddit_posts = reddit_extractor(subreddit_name, num_posts)

0it [00:00, ?it/s]

In [40]:
reddit_posts

Unnamed: 0,Post_ID,Post,Comment,Score
0,hl93cq,“I feel like escapism is your biggest coping m...,Some therapists are so pure,172
1,hl93cq,“I feel like escapism is your biggest coping m...,These are beautiful words. I guess somehow we ...,52
2,i2qfev,"I don’t know how you do it, but thank you!",Needed to read this before starting my week to...,89
3,i2qfev,"I don’t know how you do it, but thank you!",Yes!! I’m so fu*king grateful for everything t...,24
4,hrrthh,When you feel anxious that someone is mad at y...,The wonders of Cognitive Behavioral therapy,120
...,...,...,...,...
992,gevlkd,I started therapy. \nMy mom thinks that I have...,Thanks for sharing that’s a lot to unpack. Rol...,11
993,fvgnyk,"Just realized and, really, it’s the small chan...","oh yes, I remember doing that as well!\n\nyou ...",9
994,fvgnyk,"Just realized and, really, it’s the small chan...","Congrats, man! I’m proud of you! :)",3
995,mzygqq,I'll do my best to not make this too long.\n\n...,Ask them for a 3 session package and use it as...,28


In [41]:
# Cleaning reddit data
reddit_data_cleaned = reddit_data_cleaner(reddit_posts)

In [42]:
reddit_data_cleaned

Unnamed: 0,Post_ID,Post,Comment,Score
0,9pnvpg,"It's not fucking me!! The lack of motivation, ...",Whoa that sent chills down my spine! That’s a ...,19
1,a9ljsw,All my relatives giving me cash to “buy yourse...,This!!!,3
2,ak05zi,"When I’m particularly feeling crappy, I’ll go ...",Whenever I have a particularly hard time one o...,21
3,blzpqs,"In case no one has told you this, it’s okay to...",thank you so much for this. ❤️,8
4,bo7q1c,Title says it all. I'm so mad at myself. Can't...,My therapist used to ask me if I really didn’t...,18
...,...,...,...,...
482,ntr0kl,I really hate it. They generally ask when an e...,I'm gobsmacked that you even tell people tbh b...,28
483,nu71fd,"Hey, I’m 15 I’ve been extremely poor my whole ...","Your not alone, I'm 27m and I grew up in extr...",159
484,numfg5,I'm just discovering HOW MUCH I'm used to igno...,YES I’ve been a lot better about listening to ...,23
485,nv1nxv,I (18) started therapy 3 years ago because of ...,"gosh, what a shitty person. if you’re not will...",98


In [43]:
# Saving data as csv file
today = pd.to_datetime('now').date()
reddit_data_cleaned.to_csv('{}_{}.csv'.format(today, subreddit_name), 
                           index=False)

In [44]:
# Selecting multiple subreddits
subreddit_name_list = ['askmen', 'askwomen', 'askscience', 
                       'confession', 'fitness', 'food', 'jokes', 
                       'letstalkmusic', 'lifeprotips', 'machinelearning', 
                       'math', 'gaming']
num_posts = 500

In [45]:
for subreds in subreddit_name_list:
    reddit_posts = reddit_extractor(subreds, num_posts)
    reddit_data_cleaned = reddit_data_cleaner(reddit_posts)
    today = pd.to_datetime('now').date()
    reddit_data_cleaned.to_csv('{}_{}.csv'.format(today, subreds), 
                               index=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Forbidden: received 403 HTTP response

In [52]:
subreddit_name_list_2 = ['relationships', 'roastme', 
                         'science', 'sex', 'todayilearned', 
                         'totallynotrobots', 'travel', 'wouldyourather',
                         'writingprompts', 'zen']
num_posts = 500

In [53]:
for subreds in subreddit_name_list_2:
    reddit_posts = reddit_extractor(subreds, num_posts)
    reddit_data_cleaned = reddit_data_cleaner(reddit_posts)
    today = pd.to_datetime('now').date()
    reddit_data_cleaned.to_csv('{}_{}.csv'.format(today, subreds), 
                               index=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]