# 1. Libraries and Configuration

In [1]:
import configparser
import pandas as pd
from datetime import datetime
from psaw import PushshiftAPI
import praw
from tqdm import tqdm

## 1.1 Configure PSAW

- __A Python wrapper for pushshift.io__

In [2]:
# Pushshift client; used further down to search submissions per subreddit.
api = PushshiftAPI()

## 1.2 Configure PRAW

In [3]:
# Load the Reddit API credentials from a local INI file kept outside the notebook.
praw_config_path = '../praw_config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of letting the
# credential lookup below raise an unrelated-looking KeyError.
if not config.read(praw_config_path):
    raise FileNotFoundError(
        'PRAW config file not found or unreadable: {}'.format(praw_config_path)
    )

praw_credentials = config['praw_credentials']

# PRAW client; used further down to fetch the comments of each submission.
r_praw = praw.Reddit(client_id=praw_credentials['client_id'],
                     client_secret=praw_credentials['secret'],
                     redirect_uri='http://localhost:8080',
                     user_agent='chatbot')

# 3. Scrape Submissions and Comments

In [4]:
# Communities whose submissions and comments will be collected
subreddits = [
    'depression',
    'anxiety',
    'affirmations',
    'BodyAcceptance',
    'OCD',
]

# Column store for the scraped rows: one independent, initially empty list per
# column. The first seven columns describe the submission, the `comment_*`
# columns describe the comment attached to that row.
scraped_dict = {
    column: []
    for column in (
        'author',
        'score',
        'created',
        'subreddit',
        'title',
        'body',
        'id',
        'comment_author',
        'comment_body',
        'comment_score',
        'comment_edited',
    )
}

In [5]:
# For each subreddit, query Pushshift for submissions (sorted by score, descending),
# then load every submission through PRAW and record one row per comment.
# Submissions without any comments therefore contribute no rows.
# NOTE: rows are appended to the module-level `scraped_dict`, so re-running this
# cell without re-initialising that dictionary will duplicate the data.
for subreddit in subreddits:
    posts = list(api.search_submissions(limit=10000, subreddit=subreddit, sort='desc', sort_type='score'))

    for post in tqdm(posts):

        # Load the submission's comment forest through PRAW.
        # replace_more(limit=0) is expected to drop the unexpanded
        # "more comments" placeholders rather than fetch them (see PRAW docs).
        submission = r_praw.submission(id=post.id)
        submission.comments.replace_more(limit=0)

        for comment in submission.comments:
            # Submission-level fields, repeated on every comment row
            scraped_dict['author'].append(post.author)
            scraped_dict['score'].append(post.score)
            scraped_dict['created'].append(post.created_utc)
            scraped_dict['subreddit'].append(post.subreddit)
            scraped_dict['title'].append(post.title)
            # `selftext` may be absent on a Pushshift record; fall back to None for
            # that case only, instead of a bare `except:` that would also swallow
            # KeyboardInterrupt and genuine errors.
            scraped_dict['body'].append(getattr(post, 'selftext', None))
            scraped_dict['id'].append(post.id)

            # Comment-level fields
            scraped_dict['comment_author'].append(comment.author)
            scraped_dict['comment_body'].append(comment.body)
            scraped_dict['comment_score'].append(comment.score)
            scraped_dict['comment_edited'].append(comment.edited)

    # Checkpoint after each subreddit: the file is overwritten with everything
    # collected so far, so a crash in a later subreddit does not lose earlier work.
    scraped_df = pd.DataFrame(scraped_dict)
    scraped_df.to_csv('../data/scraped_data/reddit_submission_comments_iter.csv')



  0%|          | 0/5652 [00:00<?, ?it/s]  0%|          | 1/5652 [00:00<1:14:32,  1.26it/s]  0%|          | 2/5652 [00:02<1:27:15,  1.08it/s]  0%|          | 3/5652 [00:02<1:10:42,  1.33it/s]  0%|          | 4/5652 [00:03<1:19:01,  1.19it/s]  0%|          | 5/5652 [00:05<1:48:33,  1.15s/it]  0%|          | 6/5652 [00:06<1:40:12,  1.06s/it]  0%|          | 7/5652 [00:06<1:24:09,  1.12it/s]  0%|          | 8/5652 [00:07<1:16:25,  1.23it/s]  0%|          | 9/5652 [00:08<1:18:12,  1.20it/s]  0%|          | 10/5652 [00:09<1:35:01,  1.01s/it]  0%|          | 11/5652 [00:11<1:55:51,  1.23s/it]  0%|          | 12/5652 [00:12<1:43:16,  1.10s/it]  0%|          | 13/5652 [00:12<1:28:34,  1.06it/s]  0%|          | 14/5652 [00:13<1:32:21,  1.02it/s]  0%|          | 15/5652 [00:15<1:50:08,  1.17s/it]  0%|          | 16/5652 [00:16<2:00:39,  1.28s/it]  0%|          | 17/5652 [00:17<1:40:58,  1.08s/it]  0%|          | 18/5652 [00:17<1:18:47,  1.19it/s]  0%|          | 19/5652 [00:19

In [6]:
# Build the DataFrame from the accumulated column lists and display it.
scraped_df = pd.DataFrame(scraped_dict)
scraped_df

Unnamed: 0,author,score,created,subreddit,title,body,id,comment_author,comment_body,comment_score,comment_edited
0,allthatglitters221,5863,1564030318,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],357,False
1,allthatglitters221,5863,1564030318,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],595,1.56403e+09
2,allthatglitters221,5863,1564030318,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],86,False
3,allthatglitters221,5863,1564030318,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],112,False
4,allthatglitters221,5863,1564030318,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],55,False
...,...,...,...,...,...,...,...,...,...,...,...
335642,[deleted],8,1247760550,OCD,What's up lonely little OCD subreddit?,[deleted],91sxp,,[deleted],2,False
335643,[deleted],8,1247760550,OCD,What's up lonely little OCD subreddit?,[deleted],91sxp,orblivion,"Hey, I found you. Guess this subreddit wasn't ...",1,False
335644,[deleted],4,1252145513,OCD,I have a question.,[removed],9hkj4,adolfojp,I used Paxil and Celexa. Paxil was awesome... ...,1,1.26663e+09
335645,GillyRoze,3,1257206986,OCD,Is involuntarily Right Clicking Your Mouse Whi...,,a0bn4,orblivion,"If you really mean involuntary, then I would s...",3,False


# 4. Wrangle Data

## 4.1 Convert 'created' from unix time to datetime object

In [7]:
# Convert the Unix epoch seconds in 'created' into 'YYYY-MM-DD HH:MM:SS' strings (UTC).
# The vectorised pandas conversion replaces a per-row datetime.utcfromtimestamp call,
# which is deprecated in recent Python releases and slow on a frame of this size.
# NOTE: the column is overwritten in place; rebuild `scraped_df` before re-running this
# cell, since the already-formatted strings are not valid epoch values.
scraped_df['created'] = pd.to_datetime(scraped_df['created'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

In [8]:
# Number of scraped rows (one per comment) for each subreddit.
scraped_df['subreddit'].value_counts()

depression        152498
Anxiety           120113
OCD                48716
BodyAcceptance     13655
affirmations         665
Name: subreddit, dtype: int64

In [9]:
# Persist the wrangled frame. The DataFrame index is written too (pandas default),
# so readers of this file get an extra leading, unnamed column.
scraped_df.to_csv('../data/scraped_data/reddit_submissions_comments.csv')


In [10]:
# Display the frame to confirm 'created' now holds formatted timestamp strings.
scraped_df

Unnamed: 0,author,score,created,subreddit,title,body,id,comment_author,comment_body,comment_score,comment_edited
0,allthatglitters221,5863,2019-07-25 04:51:58,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],357,False
1,allthatglitters221,5863,2019-07-25 04:51:58,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],595,1.56403e+09
2,allthatglitters221,5863,2019-07-25 04:51:58,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],86,False
3,allthatglitters221,5863,2019-07-25 04:51:58,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],112,False
4,allthatglitters221,5863,2019-07-25 04:51:58,depression,"For every upvote I get on this post, I'll stay...",I know it's baity. But honestly I'm desperate ...,chj5ux,,[removed],55,False
...,...,...,...,...,...,...,...,...,...,...,...
335642,[deleted],8,2009-07-16 16:09:10,OCD,What's up lonely little OCD subreddit?,[deleted],91sxp,,[deleted],2,False
335643,[deleted],8,2009-07-16 16:09:10,OCD,What's up lonely little OCD subreddit?,[deleted],91sxp,orblivion,"Hey, I found you. Guess this subreddit wasn't ...",1,False
335644,[deleted],4,2009-09-05 10:11:53,OCD,I have a question.,[removed],9hkj4,adolfojp,I used Paxil and Celexa. Paxil was awesome... ...,1,1.26663e+09
335645,GillyRoze,3,2009-11-03 00:09:46,OCD,Is involuntarily Right Clicking Your Mouse Whi...,,a0bn4,orblivion,"If you really mean involuntary, then I would s...",3,False
