# Creating Our Own Reddit Datasets

In [23]:
# import what we need
import os

import pandas as pd
import praw

## Getting Reddit Data

In order to get data from the reddit API, you'll need to authenticate with the following:

- `client_id`
- `client_secret`
- `user_agent`

To get this info, go to https://www.reddit.com/prefs/apps and click **create (another) app**.
(You'll need a Reddit account to do this.)

Once you've done this, the information is shown as follows:

![reddit_app](../img/reddit_app_image.png)

For the sake of the demo, you can store the info in the variables below, but be careful not to push the code anywhere others can see it! (Worst case, someone uses your Reddit account; I don't know how much that matters, but still.)

In [3]:
# Read the credentials from the environment so real secrets never have to be
# typed into (and accidentally committed with) the notebook. Each value falls
# back to "" when its variable is unset, which matches the original placeholders.
client_id = os.environ.get("REDDIT_CLIENT_ID", "")
client_secret = os.environ.get("REDDIT_CLIENT_SECRET", "")
user_agent = os.environ.get("REDDIT_USER_AGENT", "")

In [12]:
class ScrapeReddit():
    """
    Scrape reddit threads & comments via praw.

    Wraps a ``praw.Reddit`` client (``self.reddit``) and turns a subreddit's
    top posts and their comments into pandas dataframes.
    """

    def __init__(self, client_id, client_secret, user_agent):
        """
        Keep the API credentials and build the praw client from them.

        Parameters
        ----------
        client_id, client_secret, user_agent
            Passed unchanged to ``praw.Reddit`` by ``connect_to_api``.
        """
        # probably bad to store api creds as attributes? it's a demo whatever
        self.client_id = client_id
        self.client_secret = client_secret
        self.user_agent = user_agent
        self.reddit = self.connect_to_api()

    def connect_to_api(self):
        """
        Build a ``praw.Reddit`` client from the stored credentials.

        Returns
        -------
        The ``praw.Reddit`` instance.
        """
        return praw.Reddit(client_id=self.client_id,
                           client_secret=self.client_secret,
                           user_agent=self.user_agent)

    def get_thread_df(self, subreddit, n):
        """
        Create dataframe of top n posts for a given subreddit.

        Parameters
        ----------
        subreddit
            Subreddit name, handed to ``self.reddit.subreddit``.
        n
            Passed as ``limit`` to the subreddit's ``top`` listing.

        Returns
        -------
        pandas.DataFrame
            One row per post with columns ``title``, ``id``, ``subreddit``
            and ``body`` (the post's ``selftext``).
        """
        listing = self.reddit.subreddit(subreddit).top(limit=n)
        posts = [
            # str() stores the subreddit's name (what praw's Subreddit prints
            # as) instead of keeping a live API object inside the dataframe.
            (post.title, post.id, str(post.subreddit), post.selftext)
            for post in listing
        ]
        return pd.DataFrame(posts, columns=['title', 'id', 'subreddit', 'body'])

    def get_comment_submissions(self, subreddit_df):
        """
        Collect the comments of every thread listed in ``subreddit_df``.

        Parameters
        ----------
        subreddit_df
            Dataframe with an ``id`` column of submission ids (for example
            the result of ``get_thread_df``).

        Returns
        -------
        pandas.DataFrame
            One row per comment with columns ``id`` (the thread id) and
            ``comment`` (the comment's ``body``).
        """
        rows = []
        for thread_id in subreddit_df['id']:
            submission = self.reddit.submission(id=thread_id)
            # limit=0 drops the "load more comments" placeholders without
            # fetching them, so .list() below only yields real comments.
            submission.comments.replace_more(limit=0)
            # collect (thread, comment) pairs straight into one flat list
            rows.extend((thread_id, comment.body)
                        for comment in submission.comments.list())
        return pd.DataFrame(rows, columns=['id', 'comment'])

    def make_subreddit_df(self, subreddit, n, csv=None):
        """
        Create subreddit dataframe.

        Fetches the top ``n`` threads of ``subreddit`` and their comments and
        inner-joins them on ``id``, so the result has one row per comment and
        threads without any comment do not appear.

        Parameters
        ----------
        subreddit, n
            Forwarded to ``get_thread_df``.
        csv
            Optional path; when truthy the merged frame is also written there
            with ``DataFrame.to_csv`` (default options).

        Returns
        -------
        pandas.DataFrame
            Columns ``title``, ``id``, ``subreddit``, ``body``, ``comment``.
        """
        thread_df = self.get_thread_df(subreddit, n)
        comment_df = self.get_comment_submissions(thread_df)
        subreddit_df = pd.merge(thread_df, comment_df, on='id', how='inner')
        if csv:
            subreddit_df.to_csv(csv)
        return subreddit_df
            

Now, just use the class I wrote above to scrape whatever subreddit you want.
(You may or may not hit rate-limits. It's usually decently quick).

In [14]:
# build the scraper (and its praw client) from the credentials defined above
api = ScrapeReddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

### Grab data for a few different subreddits

In [20]:
# r/statistics: top 100 threads with their comments, also saved as CSV
statistics_df = api.make_subreddit_df(
    subreddit='statistics',
    n=100,
    csv="../data/statistics.csv",
)

In [21]:
# r/usc: top 100 threads with their comments, also saved as CSV
usc_df = api.make_subreddit_df(
    subreddit='usc',
    n=100,
    csv="../data/usc.csv",
)

In [17]:
# r/DunderMifflin: top 100 threads with their comments, also saved as CSV
office_df = api.make_subreddit_df(
    subreddit='DunderMifflin',
    n=100,
    csv="../data/dundermifflin.csv",
)

In [18]:
# r/overwatch: top 100 threads with their comments, also saved as CSV
overwatch_df = api.make_subreddit_df(
    subreddit='overwatch',
    n=100,
    csv="../data/overwatch.csv",
)

In [19]:
# r/dutch: top 100 threads with their comments, also saved as CSV
# NOTE(review): this cell was labelled r/cirkeltrek, but the call scrapes
# the 'dutch' subreddit; confirm which one is intended.
dutch_df = api.make_subreddit_df(
    subreddit='dutch',
    n=100,
    csv="../data/dutch.csv",
)

### Preview some of the data

In [22]:
# first five rows of the r/usc frame (one row per comment)
usc_df.head(n=5)

Unnamed: 0,title,id,subreddit,body,comment
0,Class of 2023!,b4iyil,USC,,I remember that day like yesterday. Best day o...
1,Class of 2023!,b4iyil,USC,,Congrats! The day I received mine was one of t...
2,Class of 2023!,b4iyil,USC,,I used to work in the financial aid/admissions...
3,Class of 2023!,b4iyil,USC,,Congrats!
4,Class of 2023!,b4iyil,USC,,Congratulations! Welcome! If you have any ques...
